{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv('./data/gender-classifier-DFE-791531.csv'\n",
    "                   ,header = 0\n",
    "                   ,error_bad_lines=False\n",
    "                   ,encoding='latin1'\n",
    "                   ,skip_blank_lines=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "# _unit_id(唯一ID): a unique id for user\n",
    "# _golden(黄金标准，2值): whether the user was included in the gold standard for the model; TRUE or FALSE\n",
    "# _unit_state(检测状态，2值): state of the observation; one of finalized (for contributor-judged) or golden (for gold standard observations)\n",
    "# _trusted_judgments(可信的评论数量，连续值): number of trusted judgments (int); always 3 for non-golden, and what may be a unique id for gold standard observations\n",
    "# _last_judgment_at(最后评论时间): date and time of last contributor judgment; blank for gold standard observations\n",
    "# gender(性别): one of male, female, or brand (for non-human profiles)\n",
    "# gender:confidence(性别的可信度，置信区间): a float representing confidence in the provided gender\n",
    "# profile_yn(no代表是资料收集，非预测): \"no\" here seems to mean that the profile was meant to be part of the dataset but was not available when contributors went to judge it\n",
    "# profile_yn:confidence(profile_yn置信取间): confidence in the existence/non-existence of the profile\n",
    "# created(用户创建时间): date and time when the profile was created\n",
    "# description(用户描述): the user's profile description\n",
    "# fav_number(关注人数): number of tweets the user has favorited\n",
    "# gender_gold(性别 黄金？): if the profile is golden, what is the gender?\n",
    "# link_color(十六进制，link颜色): the link color on the profile, as a hex value\n",
    "# name(名称): the user's name\n",
    "# profile_yn_gold(二值): whether the profile y/n value is golden\n",
    "# profileimage（头像）: a link to the profile image\n",
    "# retweet_count(被关注次数): number of times the user has retweeted (or possibly, been retweeted)\n",
    "# sidebar_color(边框颜色): color of the profile sidebar, as a hex value\n",
    "# text(随机抽取的tweets文本): text of a random one of the user's tweets\n",
    "# tweet_coord(如果用户开启了定位，则显示经纬度): if the user has location turned on, the coordinates as a string with the format \"[latitude, longitude]\"\n",
    "# tweet_count(发布的tweet数): number of tweets that the user has posted\n",
    "# tweet_created(随机抽取的tweet创建时间): when the random tweet (in the text column) was created\n",
    "# tweet_id(随机抽取的tweet的id): the tweet id of the random tweet\n",
    "# tweet_location(tweet的定位，没有做规范化处理): location of the tweet; seems to not be particularly normalized\n",
    "# user_timezone(用户的时区): the timezone of the user"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>_unit_id</th>\n",
       "      <th>_golden</th>\n",
       "      <th>_unit_state</th>\n",
       "      <th>_trusted_judgments</th>\n",
       "      <th>_last_judgment_at</th>\n",
       "      <th>gender</th>\n",
       "      <th>gender:confidence</th>\n",
       "      <th>profile_yn</th>\n",
       "      <th>profile_yn:confidence</th>\n",
       "      <th>created</th>\n",
       "      <th>...</th>\n",
       "      <th>profileimage</th>\n",
       "      <th>retweet_count</th>\n",
       "      <th>sidebar_color</th>\n",
       "      <th>text</th>\n",
       "      <th>tweet_coord</th>\n",
       "      <th>tweet_count</th>\n",
       "      <th>tweet_created</th>\n",
       "      <th>tweet_id</th>\n",
       "      <th>tweet_location</th>\n",
       "      <th>user_timezone</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>815719226</td>\n",
       "      <td>False</td>\n",
       "      <td>finalized</td>\n",
       "      <td>3</td>\n",
       "      <td>10/26/15 23:24</td>\n",
       "      <td>male</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>yes</td>\n",
       "      <td>1.0</td>\n",
       "      <td>12/5/13 1:48</td>\n",
       "      <td>...</td>\n",
       "      <td>https://pbs.twimg.com/profile_images/414342229...</td>\n",
       "      <td>0</td>\n",
       "      <td>FFFFFF</td>\n",
       "      <td>Robbie E Responds To Critics After Win Against...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>110964</td>\n",
       "      <td>10/26/15 12:40</td>\n",
       "      <td>6.587300e+17</td>\n",
       "      <td>main; @Kan1shk3</td>\n",
       "      <td>Chennai</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>815719227</td>\n",
       "      <td>False</td>\n",
       "      <td>finalized</td>\n",
       "      <td>3</td>\n",
       "      <td>10/26/15 23:30</td>\n",
       "      <td>male</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>yes</td>\n",
       "      <td>1.0</td>\n",
       "      <td>10/1/12 13:51</td>\n",
       "      <td>...</td>\n",
       "      <td>https://pbs.twimg.com/profile_images/539604221...</td>\n",
       "      <td>0</td>\n",
       "      <td>C0DEED</td>\n",
       "      <td>ÛÏIt felt like they were my friends and I was...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7471</td>\n",
       "      <td>10/26/15 12:40</td>\n",
       "      <td>6.587300e+17</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Eastern Time (US &amp; Canada)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>815719228</td>\n",
       "      <td>False</td>\n",
       "      <td>finalized</td>\n",
       "      <td>3</td>\n",
       "      <td>10/26/15 23:33</td>\n",
       "      <td>male</td>\n",
       "      <td>0.6625</td>\n",
       "      <td>yes</td>\n",
       "      <td>1.0</td>\n",
       "      <td>11/28/14 11:30</td>\n",
       "      <td>...</td>\n",
       "      <td>https://pbs.twimg.com/profile_images/657330418...</td>\n",
       "      <td>1</td>\n",
       "      <td>C0DEED</td>\n",
       "      <td>i absolutely adore when louis starts the songs...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5617</td>\n",
       "      <td>10/26/15 12:40</td>\n",
       "      <td>6.587300e+17</td>\n",
       "      <td>clcncl</td>\n",
       "      <td>Belgrade</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>815719229</td>\n",
       "      <td>False</td>\n",
       "      <td>finalized</td>\n",
       "      <td>3</td>\n",
       "      <td>10/26/15 23:10</td>\n",
       "      <td>male</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>yes</td>\n",
       "      <td>1.0</td>\n",
       "      <td>6/11/09 22:39</td>\n",
       "      <td>...</td>\n",
       "      <td>https://pbs.twimg.com/profile_images/259703936...</td>\n",
       "      <td>0</td>\n",
       "      <td>C0DEED</td>\n",
       "      <td>Hi @JordanSpieth - Looking at the url - do you...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1693</td>\n",
       "      <td>10/26/15 12:40</td>\n",
       "      <td>6.587300e+17</td>\n",
       "      <td>Palo Alto, CA</td>\n",
       "      <td>Pacific Time (US &amp; Canada)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>815719230</td>\n",
       "      <td>False</td>\n",
       "      <td>finalized</td>\n",
       "      <td>3</td>\n",
       "      <td>10/27/15 1:15</td>\n",
       "      <td>female</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>yes</td>\n",
       "      <td>1.0</td>\n",
       "      <td>4/16/14 13:23</td>\n",
       "      <td>...</td>\n",
       "      <td>https://pbs.twimg.com/profile_images/564094871...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Watching Neighbours on Sky+ catching up with t...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>31462</td>\n",
       "      <td>10/26/15 12:40</td>\n",
       "      <td>6.587300e+17</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 26 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    _unit_id  _golden _unit_state  _trusted_judgments _last_judgment_at  \\\n",
       "0  815719226    False   finalized                   3    10/26/15 23:24   \n",
       "1  815719227    False   finalized                   3    10/26/15 23:30   \n",
       "2  815719228    False   finalized                   3    10/26/15 23:33   \n",
       "3  815719229    False   finalized                   3    10/26/15 23:10   \n",
       "4  815719230    False   finalized                   3     10/27/15 1:15   \n",
       "\n",
       "   gender  gender:confidence profile_yn  profile_yn:confidence  \\\n",
       "0    male             1.0000        yes                    1.0   \n",
       "1    male             1.0000        yes                    1.0   \n",
       "2    male             0.6625        yes                    1.0   \n",
       "3    male             1.0000        yes                    1.0   \n",
       "4  female             1.0000        yes                    1.0   \n",
       "\n",
       "          created             ...              \\\n",
       "0    12/5/13 1:48             ...               \n",
       "1   10/1/12 13:51             ...               \n",
       "2  11/28/14 11:30             ...               \n",
       "3   6/11/09 22:39             ...               \n",
       "4   4/16/14 13:23             ...               \n",
       "\n",
       "                                        profileimage  retweet_count  \\\n",
       "0  https://pbs.twimg.com/profile_images/414342229...              0   \n",
       "1  https://pbs.twimg.com/profile_images/539604221...              0   \n",
       "2  https://pbs.twimg.com/profile_images/657330418...              1   \n",
       "3  https://pbs.twimg.com/profile_images/259703936...              0   \n",
       "4  https://pbs.twimg.com/profile_images/564094871...              0   \n",
       "\n",
       "  sidebar_color                                               text  \\\n",
       "0        FFFFFF  Robbie E Responds To Critics After Win Against...   \n",
       "1        C0DEED  ÛÏIt felt like they were my friends and I was...   \n",
       "2        C0DEED  i absolutely adore when louis starts the songs...   \n",
       "3        C0DEED  Hi @JordanSpieth - Looking at the url - do you...   \n",
       "4             0  Watching Neighbours on Sky+ catching up with t...   \n",
       "\n",
       "  tweet_coord tweet_count   tweet_created      tweet_id   tweet_location  \\\n",
       "0         NaN      110964  10/26/15 12:40  6.587300e+17  main; @Kan1shk3   \n",
       "1         NaN        7471  10/26/15 12:40  6.587300e+17              NaN   \n",
       "2         NaN        5617  10/26/15 12:40  6.587300e+17           clcncl   \n",
       "3         NaN        1693  10/26/15 12:40  6.587300e+17    Palo Alto, CA   \n",
       "4         NaN       31462  10/26/15 12:40  6.587300e+17              NaN   \n",
       "\n",
       "                user_timezone  \n",
       "0                     Chennai  \n",
       "1  Eastern Time (US & Canada)  \n",
       "2                    Belgrade  \n",
       "3  Pacific Time (US & Canada)  \n",
       "4                         NaN  \n",
       "\n",
       "[5 rows x 26 columns]"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Gender - female: 6700\n",
      "Gender - male: 6194\n",
      "Gender - brand: 5942\n"
     ]
    }
   ],
   "source": [
    "##拿到y值\n",
    "series_female = data['gender'] == 'female'\n",
    "series_male = data['gender'] == 'male'\n",
    "series_brand = data['gender'] == 'brand'\n",
    "\n",
    "df_female = data[series_female]\n",
    "df_male = data[series_male]\n",
    "df_brand = data[series_brand]\n",
    "\n",
    "#三种性别的个数\n",
    "print(f'Gender - female: {df_female[\"gender\"].count()}')\n",
    "print(f'Gender - male: {df_male[\"gender\"].count()}')\n",
    "print(f'Gender - brand: {df_brand[\"gender\"].count()}')\n",
    "\n",
    "#将性别进行人工编码，female=0，male=1， brand=2\n",
    "df_female.gender = 0\n",
    "df_male.gender = 1\n",
    "df_brand.gender = 2\n",
    "\n",
    "#合并y值结果数据,行合并，ignore_index=True，默认columns相同，index纵向相加，若列合并，axis=1,但列数量必须相同\n",
    "df = pd.concat([df_female, df_male, df_brand], ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 18836 entries, 0 to 18835\n",
      "Data columns (total 26 columns):\n",
      "_unit_id                 18836 non-null int64\n",
      "_golden                  18836 non-null bool\n",
      "_unit_state              18836 non-null object\n",
      "_trusted_judgments       18836 non-null int64\n",
      "_last_judgment_at        18786 non-null object\n",
      "gender                   18836 non-null int64\n",
      "gender:confidence        18836 non-null float64\n",
      "profile_yn               18836 non-null object\n",
      "profile_yn:confidence    18836 non-null float64\n",
      "created                  18836 non-null object\n",
      "description              15522 non-null object\n",
      "fav_number               18836 non-null int64\n",
      "gender_gold              50 non-null object\n",
      "link_color               18836 non-null object\n",
      "name                     18836 non-null object\n",
      "profile_yn_gold          50 non-null object\n",
      "profileimage             18836 non-null object\n",
      "retweet_count            18836 non-null int64\n",
      "sidebar_color            18836 non-null object\n",
      "text                     18836 non-null object\n",
      "tweet_coord              157 non-null object\n",
      "tweet_count              18836 non-null int64\n",
      "tweet_created            18836 non-null object\n",
      "tweet_id                 18836 non-null float64\n",
      "tweet_location           11995 non-null object\n",
      "user_timezone            11653 non-null object\n",
      "dtypes: bool(1), float64(3), int64(6), object(16)\n",
      "memory usage: 3.6+ MB\n"
     ]
    }
   ],
   "source": [
    "#寻找x的特征，性别判别，先直观圈选\n",
    "# + (_golden) golden标准，二值，估计是一种荣耀徽章之类的？\n",
    "# + (_trusted_judgments) 可信的评论数量，男女品牌 评论数理论上应该不同吧。\n",
    "# + (description) 个人简介，太有用了，但是是text文本，需要加大处理力度了。\n",
    "# + (fav_number) 关注人数，男女品牌关注行为可能不同。\n",
    "# + (gender_gold) 性别 黄金？什么鬼\n",
    "# + (link_color) link颜色，颜色一向跟性别强相关，拿下先。\n",
    "# + (name) 名称，理论上是有差别的。\n",
    "# + (retweet_count) 被关注数？女性 跟品牌 容易被关注？\n",
    "# + (sidebar_color) 头像边框颜色，理论上还是有特征差异的。\n",
    "# + (text) 随机抽取的一条推文，推文理论上能一定程度反应，但量应该多点。\n",
    "# + (tweet_coord) 开启定位，如果开启则显示经纬度，可以考虑做成二值特征。\n",
    "# + (tweet_count) 推文数量，不同性别这个指标活跃度不同？\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "##由于特征中包含了文本属性，文本的特征性，且信息区分度可能更高，进行优先处理\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics import accuracy_score,confusion_matrix\n",
    "from sklearn import preprocessing\n",
    "\n",
    "##文本预处理函数,符号移除，大小写转换\n",
    "import re\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "def normalize_text(s, type):\n",
    "    s = str(s)\n",
    "    s = s.lower()\n",
    "    s = re.sub('\\s\\W', ' ', s)\n",
    "    s = re.sub('\\W\\s', ' ', s)\n",
    "    s = re.sub('\\s+', ' ', s)\n",
    "    s = re.sub('[^a-zA-Z]',' ', s)\n",
    "    if (type == 'name'):\n",
    "        s = re.sub('_', ' ', s)\n",
    "    return s.strip()\n",
    "\n",
    "df['description_norm'] = [normalize_text(s,'description') for s in df['description']]\n",
    "df['name_norm'] = [normalize_text(s,'name') for s in df['name']]\n",
    "df['text_norm'] = [normalize_text(s,'text') for s in df['text']]\n",
    "\n",
    "y = df['gender']\n",
    "df['description'] = df['description'].fillna(\" \")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_score(y_t, y_p, confusion):\n",
    "    text_accuracy = accuracy_score(y_t, y_p)\n",
    "\n",
    "    text_0_recall  = confusion[0,0]/(confusion[0,0]+confusion[0,1]+confusion[0,2])\n",
    "    text_0_precision = confusion[0,0]/(confusion[0,0]+confusion[1,0]+confusion[2,0])\n",
    "    text_1_recall  = confusion[1,1]/(confusion[1,0]+confusion[1,1]+confusion[1,2])\n",
    "    text_1_precision = confusion[1,1]/(confusion[0,1]+confusion[1,1]+confusion[2,1])\n",
    "    text_2_recall  = confusion[2,2]/(confusion[2,0]+confusion[2,1]+confusion[2,2])\n",
    "    text_2_precision = confusion[2,2]/(confusion[0,2]+confusion[1,2]+confusion[2,2])\n",
    "\n",
    "    text_avg_precision = (text_0_precision+text_1_precision+text_2_precision)/3\n",
    "    text_avg_recall = (text_0_recall+text_1_recall+text_2_recall)/3\n",
    "\n",
    "    print(f'0-precision: {text_0_precision}')\n",
    "    print(f'0-recall: {text_0_recall}')\n",
    "\n",
    "    print(f'1-precision: {text_1_precision}')\n",
    "    print(f'1-recall: {text_1_recall}')\n",
    "\n",
    "    print(f'2-precision: {text_2_precision}')\n",
    "    print(f'2-recall: {text_2_recall}')\n",
    "\n",
    "    print(f'avg-precison: {text_avg_precision}')\n",
    "    print(f'avg-recall: {text_avg_recall}')\n",
    "    print(f'accuracy: {text_accuracy}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "confusion_matrix: \n",
      "[[2015  189  104]\n",
      " [1271  720  209]\n",
      " [ 567  193 1325]]\n",
      "0-precision: 0.5229691149753439\n",
      "0-recall: 0.8730502599653379\n",
      "1-precision: 0.6533575317604355\n",
      "1-recall: 0.32727272727272727\n",
      "2-precision: 0.8089133089133089\n",
      "2-recall: 0.6354916067146283\n",
      "avg-precison: 0.6617466518830295\n",
      "avg-recall: 0.6119381979842312\n",
      "accuracy: 0.6158046412862126\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(12243, 66195)"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "###使用文本进行模型训练\n",
    "from sklearn.naive_bayes import MultinomialNB,BernoulliNB\n",
    "\n",
    "c_vec_s = CountVectorizer(analyzer='word',stop_words='english')\n",
    "df_text = pd.concat([df['description_norm'], df['name_norm'] , df['text_norm']], axis=1)\n",
    "# 要做类型转换\n",
    "\n",
    "x_text_count = c_vec_s.fit_transform((df['text']+df['description']).tolist()) \n",
    "# x_text_count = c_vec_s.fit_transform((df['text_norm']+df['description_norm']).tolist()) \n",
    "x_train, x_test, y_train, y_test = train_test_split(x_text_count, y, test_size = 0.35)\n",
    "\n",
    "#贝叶斯\n",
    "nb = BernoulliNB()\n",
    "nb.fit(x_train, y_train)\n",
    "\n",
    "y_predict = nb.predict(x_test)\n",
    "text_confusion = confusion_matrix(y_test, y_predict,labels=[0,1,2])\n",
    "print(f'confusion_matrix: \\n{text_confusion}')\n",
    "\n",
    "print_score(y_test, y_predict, text_confusion)\n",
    "\n",
    "x_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "###试图优化text维度的模型，从以下几个方面优化\n",
    "#+ 特征处理，弃用countvector，采用tfidf\n",
    "#+ 贝叶斯的参数\n",
    "#+ 更换模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "confusion_matrix: \n",
      "[[2000  230   86]\n",
      " [1252  737  195]\n",
      " [ 567  208 1318]]\n",
      "0-precision: 0.5236973029588897\n",
      "0-recall: 0.8635578583765112\n",
      "1-precision: 0.6272340425531915\n",
      "1-recall: 0.3374542124542125\n",
      "2-precision: 0.8242651657285803\n",
      "2-recall: 0.6297181079789775\n",
      "avg-precison: 0.6583988370802204\n",
      "avg-recall: 0.610243392936567\n",
      "accuracy: 0.6150462611861065\n"
     ]
    }
   ],
   "source": [
    "#使用tfidf\n",
    "t_vec_s = TfidfVectorizer(analyzer='word',stop_words='english')\n",
    "# text_t_x = t_vec_s.fit_transform((df['text_norm']+df['description_norm']).tolist()) \n",
    "x_text_tfidf = t_vec_s.fit_transform((df['text']+df['description']).tolist()) \n",
    "\n",
    "x_train, x_test, y_train, y_test = train_test_split(x_text_tfidf, y, test_size = 0.35)\n",
    "\n",
    "#贝叶斯\n",
    "nb_t = BernoulliNB()\n",
    "nb_t.fit(x_train, y_train)\n",
    "y_predict = nb_t.predict(x_test)\n",
    "text_confusion = confusion_matrix(y_test, y_predict,labels=[0,1,2])\n",
    "print(f'confusion_matrix: \\n{text_confusion}')\n",
    "print_score(y_test,y_predict,text_confusion)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(12243, 66195)\n",
      "confusion_matrix: \n",
      "[[1653  529  171]\n",
      " [ 796 1092  272]\n",
      " [ 319  338 1423]]\n",
      "0-precision: 0.5971820809248555\n",
      "0-recall: 0.7025074373140672\n",
      "1-precision: 0.557427258805513\n",
      "1-recall: 0.5055555555555555\n",
      "2-precision: 0.762593783494105\n",
      "2-recall: 0.6841346153846154\n",
      "avg-precison: 0.6390677077414911\n",
      "avg-recall: 0.630732536084746\n",
      "accuracy: 0.632185651448506\n",
      "confusion_matrix: \n",
      "[[2353    0    0]\n",
      " [2160    0    0]\n",
      " [2080    0    0]]\n",
      "0-precision: 0.3568936751099651\n",
      "0-recall: 1.0\n",
      "1-precision: nan\n",
      "1-recall: 0.0\n",
      "2-precision: nan\n",
      "2-recall: 0.0\n",
      "avg-precison: nan\n",
      "avg-recall: 0.3333333333333333\n",
      "accuracy: 0.3568936751099651\n"
     ]
    }
   ],
   "source": [
    "#更换count的NB算法\n",
    "from sklearn.naive_bayes import BernoulliNB\n",
    "from sklearn.linear_model import LogisticRegressionCV\n",
    "from sklearn.svm import SVC\n",
    "\n",
    "x_train, x_test, y_train, y_test = train_test_split(x_text_count, y, test_size = 0.35)\n",
    "\n",
    "print(x_train.shape)\n",
    "#贝叶斯\n",
    "model = LogisticRegressionCV()\n",
    "model.fit(x_train, y_train)\n",
    "\n",
    "y_predict = model.predict(x_test)\n",
    "text_confusion = confusion_matrix(y_test, y_predict,labels=[0,1,2])\n",
    "print(f'confusion_matrix: \\n{text_confusion}')\n",
    "\n",
    "print_score(y_test, y_predict, text_confusion)\n",
    "\n",
    "#SVM\n",
    "model = SVC()\n",
    "model.fit(x_train, y_train)\n",
    "\n",
    "y_predict = model.predict(x_test)\n",
    "text_confusion = confusion_matrix(y_test, y_predict,labels=[0,1,2])\n",
    "print(f'confusion_matrix: \\n{text_confusion}')\n",
    "\n",
    "print_score(y_test, y_predict, text_confusion)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "##count和tfidf对比来看\n",
    "# + tfidf的对应recall稍微差点，precision好点\n",
    "# + 整体差异不明显，但对于female类型 较显著\n",
    "# + 从整体的角度看，male的precision和recall都比较差，female和brand效果勉强"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "#特征与y的分布，构建一个散点分布函数\n",
    "\n",
    "def drawScatter(x, y, xLabel):\n",
    "    plt.figure(figsize=(10,5))\n",
    "    plt.scatter(x, y)\n",
    "    plt.title('%s VS Gender' %xLabel)\n",
    "    plt.xlabel(xLabel)\n",
    "    plt.ylabel('Gender')\n",
    "    plt.yticks(range(0, 2, 1)) # 纵轴起点，最大值，间隔, 对应的就是gender\n",
    "    plt.grid()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlsAAAFOCAYAAACxNtjEAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAF39JREFUeJzt3Xu47fd8J/D3p4lrjmIkopLIyRBVxONyXFq0J9StLpnBqBQVNWI6xVONjrRG6Zh20BoepePyDIm2EpdRPcRliB7qFuJSQamMBpEgREKCaOIzf6zfSZbjnH2WJN+919n79Xqe/WSt3/Wz9uc5+/fO9/dbv191dwAAGONn1roAAID1TNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAq52VdVVdYvdzDumqt6/2jXtDVb6vQF7L2EL2OtV1bWr6oKqutcu5r2wqt44vb5HVX2wqi6sqvOr6gNVdecVtnt4VZ1cVedV1Xeq6gtV9RdVdfDIzwOsL8IWsNfr7h8keV2S35yfXlX7JDk6yYlV9bNJ3prkL5L8myQHJfnjJJfsapvTCNNpSc5Jcofu/tkkd0/y/5LcY8wnuXKqat+1rgHYPWEL2KWqumNVfaKqvltVb6iq11XVf5+b/4SqOnMaIdpWVTfdzXZuNM3/TlV9JMnNd5p/q6p617Sdz1fVI+bmnVBVL62qU6Y6Tquqm//ETmZOTPKwqrru3LT7ZfZ37u1Jbpkk3X1Sd1/W3d/v7v/b3Z/azfaeneQD3f173X32tO43uvtF3X3yXI0PqqpPTiNrH6yq283NO6uqnlZVn5pG015XVdeem//7VXVuVZ1TVb+10+/lWlX151X15ar6elW9rKquM83bWlVnV9XTq+prSV69m88ALAFhC/gJVXXNJH+b5ITMRoFOSvLv5+bfK8n/SPKIJD+X5EtJTv6JDc28NMkPpuV+a/rZsZ39krwryWuT3DizUai/rKrbzK1/dGYjUDdMcmaSP9nVTrr7g0nOTfLQucmPSfLa7r40yT8nuayqTqyqB1TVDffwa/jVJP9npQWq6o5JXpXkiUlulOTlSbZV1bXmFntEkvsnOSzJ7ZIcM617/yRPS3KfJIdP+5v3vMwC4u2T3CKzkbg/mpt/k8x6c2iSY/fwWYA1JGwBu3K3JPsmeXF3/2t3vynJR+bmPyrJq7r74919SZI/SPKLVbV5fiPTabyHJfmj7r64uz+d2QjUDg9KclZ3v7q7L+3uj2cWcB4+t8ybuvsjU2D6m8zCx+68JtOpxOm04VE79tfd38ns9F8neWWS86YRtwN3s639k3xt7rM8aRq9uqiqXjlNfkKSl3f3adNo2YmZnZa829x2Xtzd53T3+UneMlf/I5K8urs/3d0XZzaStmNfNW37qd19fnd/N8mfJnnk3HZ/lORZ3X1Jd39/hd8JsMaELWBXbprkq/3jT6r/yk7zv7TjTXdflORbmY2+zDsgs9A2v+6X5l4fmuSuU4i5oKouyCzI3WRuma/Nvf5ekk0r1P2aJEdW1UGZBbYzu/sTc3X+U3cf090HJ7nt9DletJttfSuz0bgd676ku28wLX+NufqP26n+Q6bt7qn+m2b3v5cDklw3ycfmtvuOafoO503XqgFLTtgCduXcJAdNIyw7HDL3+pzMgkaSy08H3ijJV3faznlJLt1p3ZvNvf5Kkvd29w3mfjZ1929fmaK7+8tJ/iGzwPaYzMLX7pb9XGanSW+7m0VOzY+fktyVryT5k53qv253n7RAuedm97+Xbyb5fpLbzG33+t09HzTngzCwxIQtYFc+lOSyJE+qqn2r6qgkd5mb/9okj6uq20/XJ/1pktO6+6z5jXT3ZUnelOTZVXXdqrp1ksfOLfLWJLesqsdU1TWmnztX1S9chdpPTPKkzL45+Dc7Jk4X4h+347YNVXVIZteDfXg323l2kntW1f+cRspSVfsnma/tlUn+U1XdtWb2q6oHVtX1Fqjz9UmOqapbTxf1P2vHjO7+0bTtF1bVjad9H1RV91vkFwAsF2EL+And/cPMRnUen+SCJI/OLBhdMs0/NckzM7u+6tzMvmH4yF1ubBZ8NmV2Ou2EzH1zbroW6b7TuudMyzwvybV+YiuLe2NmF9Of2t3nzk3/bpK7Jjmtqi7OLGR9Oslxu9pId/9zZtdeHZzkH6vqu0k+MNX5zGmZ0zO7tuolSb6d2QX8xyxSZHe/PbNTku+Z1nvPTos8fZr+4ar6TpJ3J/n5RbYNLJf68UsyAHatqk5L8rLudpsBgJ+CkS1gl6rqV6rqJtNpxMdmdtuCd6x1XQB7G3cdhg2sqm6W5LO7mf38zK4r2pTZXdMfvtNpOQAW4DQiAMBATiMCAAwkbAEADLRU12ztv//+vXnz5lXb38UXX5z99ttv1fbHyvRjuejH8tGT5aIfy2e1e/Kxj33sm919wJ6WW6qwtXnz5px++umrtr/t27dn69atq7Y/VqYfy0U/lo+eLBf9WD6r3ZOq+tKel3IaEQBgKGELAGAgYQsAYCBhCwBgIGELAGAgYQsAYCBhCwBgIGELAGAgYQsAYKCluoP8atl8/ClJkuOOuDTHTK/Peu4D17IkAOBKWvbj+oYb2drRkEWnAwDLa284rm+4sAUAsJqELQCAgYQtAICBhC0AgIE2XNja3bcTlulbCwDAYvaG4/qGvPXDjgZs3749Zz1q69oWAwBcJct+XN9wI1sAAKtJ2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGGiPYauq9qmqd69GMQAA680ew1Z3X5bke1V1/VWoBwBgXdl3weV+kOSMqnpXkot3TOzupwypCgBgnVg0bJ0y/QAA8FNYKGx194lVdZ0kN+vuzw+uCQBg3Vjo24hV9eAkn0zyjun97atq28jCAADWg0Vv/fDsJHdJckGSdPcnkxw2qCYAgHVj0bB1aXdfuNO0vrqLAQBYbxa9QP7TVfUbSfapqsOTPCXJB8eVBQCwPiw6svXkJLdJckmSk5J8J8nvjioKAGC9WPTbiN9L8ozpBwCABa0YtqrqLVnh2qzufsjVXhEAwDqyp5GtP5/++9AkN0ny19P7o5OctdKKVfWqJA9K8o3uvu1VqBEAYK+1Ytjq7vcmSVU9p7t/eW7WW6rqfXvY9glJXpLkNVepwgE2Hz+7Gf5xR1yaY6bXZz33gWtZEgBwJS37cX3RC+QPqKp/u+NNVR2W5ICVVuju9yU5/yrUNsSOhiw6HQBYXnvDcX3RWz88Ncn2qvri9H5zkicOqQgAYB2p7sXuTVpV10pyq+nt57r7kgXW2ZzkrStds1VVxyY5NkkOPPDAO5188skL1XNlnfHVK+7NeuB1kq9//4p5Rxx0/aH7ZmUXXXRRNm3atNZlMNGP5aMny0U/lsNaHtePPPLIj3X3lj0t99OErV/KbETr8tGw7l7xeqxFwta8LVu29Omnn75QPVfW/LDicUdcmhecccXg3jKd392Itm/fnq1bt651GUz0Y/noyXLRj+Wwlsf1qloobC10GrGq/irJzTN7GPVl0+TOEl78DgCwTBa9QH5Lkrt393/u7idPP09ZaYWqOinJh5L8fFWdXVWPv6rFXh12l3KNagHA3mdvOK4v/GzEzO6zde6iG+7uo69URatgRwO2b9+esx61dW2LAQCukmU/ri8atvZP8tmq+khmz0dM4g7yAAB7smjYevbIIgAA1qtFH0T93qo6NMnh3f3uqrpukn3GlgYAsPdb6AL5qnpCkjcmefk06aAkbx5VFADAerHotxF/J8ndk3wnSbr7C0luPKooAID1YtGwdUl3/3DHm6raN7P7bAEAsIJFw9Z7q+oPk1ynqu6T5A1J3jKuLACA9WHRsHV8kvOSnJHZcwxP6e5nDKsKAGCdWDFsVdVRVfU73f2j7n5lkkMzu5v8H1bVw1elQgCAvdieRrb+S5Jtc++vmeROSbYm+e1BNQEArBt7us/WNbv7K3Pv39/d5yc5v6r2G1gXAMC6sKeRrRvOv+nuJ829PeDqLwcAYH3ZU9g6bbqh6Y+pqicm+ciYkgAA1o89nUZ8apI3V9VvJPn4NO1OSa6V5N+NLAwAYD1YMWx19zeS/FJV3SvJbabJp3T3e4ZXBgCwDiz6IOr3JBGwAAB+Sove1BQAgCtB2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhI2AIAGEjYAgAYSNgCABhoaNiqqvtX1eer6syqOn7kvgAAltG+ozZcVfskeWmS+yQ5O8lHq2pbd3921D4Xtfn4U5Ikxx1xaY6ZXp/13AeuZUkAwJW07Mf1kSNbd0lyZnd/sbt/mOTkJEcN3N9CdjRk0ekAwPLaG47rI8PWQUm+Mvf+7GkaAMCGUd09ZsNV/yHJ/br7P07vH5PkLt395J2WOzbJsUly4IEH3unkk08eUs8OZ3z1wstfH3id5Ovfv2LeEQddf+i+WdlFF12UTZs2rXUZTPRj+ejJctGP5bCWx/UjjzzyY929ZU/LDbtmK7ORrEPm3h+c5JydF+ruVyR5RZJs2bKlt27dOrCkXH4uN5md233BGVf8Cs561Nh9s7Lt27dndP9ZnH4sHz1ZLvqxHPaG4/rI04gfTXJ4VR1WVddM8sgk2wbuDwBg6QwLW919aZInJXlnkn9K8vru/syo/S1qd99OWKZvLQAAi9kbjusjTyOmu9+W5G0j93Fl7GjA9u3bl2aIEQC4cpb9uO4O8gAAAwlbAAADCVsAAAMJWwAAAwlbAAADCVsAAAMJWwAAAwlbAAADCVsAAANVd691DZerqvOSfGkVd7l/km+u4v5YmX4sF/1YPnqyXPRj+ax2Tw7t7gP2tNBSha3VVlWnd/eWta6DGf1YLvqxfPRkuejH8lnWnjiNCAAwkLAFADDQRg9br1jrAvgx+rFc9GP56Mly0Y/ls5Q92dDXbAEAjLbRR7YAAIZa92Grqu5fVZ+vqjOr6vhdzL9WVb1umn9aVW1e/So3lgV68ntV9dmq+lRVnVpVh65FnRvFnvoxt9zDq6qraum+6bOeLNKPqnrE9G/kM1X12tWucaNZ4G/Wzarq76vqE9PfrV9bizo3iqp6VVV9o6o+vZv5VVUvnvr1qaq642rXuLN1Hbaqap8kL03ygCS3TnJ0Vd16p8Uen+Tb3X2LJC9M8rzVrXJjWbAnn0iypbtvl+SNSZ6/ulVuHAv2I1V1vSRPSXLa6la4sSzSj6o6PMkfJLl7d98mye+ueqEbyIL/Rv5rktd39x2SPDLJX65ulRvOCUnuv8L8ByQ5fPo5Nsn/WoWaVrSuw1aSuyQ5s7u/2N0/THJykqN2WuaoJCdOr9+Y5N5VVatY40azx55099939/emtx9OcvAq17iRLPJvJEmek1no/cFqFrcBLdKPJyR5aXd/O0m6+xurXONGs0hPOsnPTq+vn+ScVaxvw+nu9yU5f4VFjkrymp75cJIbVNXPrU51u7bew9ZBSb4y9/7sadoul+nuS5NcmORGq1LdxrRIT+Y9Psnbh1a0se2xH1V1hySHdPdbV7OwDWqRfx+3THLLqvpAVX24qlb6P3yuukV68uwkj66qs5O8LcmTV6c0duOnPc4Mt+9a7nwV7GqEauevXy6yDFefhX/fVfXoJFuS/MrQija2FftRVT+T2en1Y1aroA1ukX8f+2Z2emRrZqO+/1BVt+3uCwbXtlEt0pOjk5zQ3S+oql9M8ldTT340vjx2YemO6+t9ZOvsJIfMvT84Pzm8e/kyVbVvZkPAKw1PctUs0pNU1a8meUaSh3T3JatU20a0p35cL8ltk2yvqrOS3C3JNhfJD7Po36y/6+5/7e5/SfL5zMIXYyzSk8cneX2SdPeHklw7s2f0sTYWOs6spvUetj6a5PCqOqyqrpnZhYvbdlpmW5LHTq8fnuQ97eZjI+2xJ9Npq5dnFrRcjzLWiv3o7gu7e//u3tzdmzO7hu4h3X362pS77i3yN+vNSY5MkqraP7PTil9c1So3lkV68uUk906SqvqFzMLWeataJfO2JfnN6VuJd0tyYXefu5YFrevTiN19aVU9Kck7k+yT5FXd/Zmq+m9JTu/ubUn+d2ZDvmdmNqL1yLWreP1bsCd/lmRTkjdM31X4cnc/ZM2KXscW7AerZMF+vDPJfavqs0kuS/L73f2ttat6fVuwJ8cleWVVPTWz01XH+J/2carqpMxOo+8/XSf3rCTXSJLufllm1839WpIzk3wvyePWptIruIM8AMBA6/00IgDAmhK2AAAGErYAAAYStgAABhK2AAAGErYAAAYStoB1q6pOqKqH72L61qryrEdgVQhbAAADCVvAXqOqnllVn6uqd1XVSVX1tGn67avqw1X1qar626q64S7Wvf+07vuTPHRu+n5V9aqq+mhVfaKqjpqmH1NVb6qqd1TVF6rq+av2QYF1RdgC9grTw68fluQOmYWl+YdhvybJ07v7dknOyOzxHfPrXjvJK5M8OMk9k9xkbvYzMnsm6p0ze+bgn1XVftO82yf59SRHJPn1qpp/uC3AQoQtYG9xjyR/193f7+7vJnlLklTV9ZPcoLvfOy13YpJf3mndWyX5l+7+wvTMur+em3ffJMdX1SeTbM/sIcI3m+adOj2M+wdJPpvk0AGfC1jn1vWDqIF1pa7i+rt7EGwleVh3f/7HJlbdNcklc5Mui7+ZwJVgZAvYW7w/yYOr6tpVtSnJA5Okuy9M8u2quue03GOSvHendT+X5LCquvn0/ui5ee9M8uSqqiSpqjuM+gDAxuT/0oC9Qnd/tKq2JfnHJF9KcnqSC6fZj03ysqq6bpIvJnncTuv+oKqOTXJKVX0zs+B222n2c5K8KMmnpsB1VpIHDf44wAZSs8sXAJZfVW3q7oumUPW+JMd298fXui6AlRjZAvYmr6iqW2d2EfuJghawNxC2gKVTVTdKcuouZt27u7+12vUAXBVOIwIADOTbiAAAAwlbAAADCVsAAAMJWwAAAwlbAAAD/X9byEagy7Z77QAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 720x360 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 - (all:0_rate:1_rate:) => 6700 : 99.77611940298507 : 0.22388059701492538\n",
      "1 - (all:1_rate:1_rate:) => 6194 : 99.66096222150468 : 0.3390377784953181\n",
      "2 - (all:2_rate:1_rate:) => 5942 : 99.76438909458095 : 0.23561090541905083\n"
     ]
    }
   ],
   "source": [
    "# _golden特征探索\n",
    "# from sklearn.feature_selection import LabelEncoder\n",
    "\n",
    "lable_golden = preprocessing.LabelEncoder()\n",
    "lable_golden.fit(['False','True'])\n",
    "_golden_df_tmp = df['_golden']\n",
    "array_golden = lable_golden.transform(_golden_df_tmp)\n",
    "df_golden = pd.DataFrame(array_golden,columns=['_golden'])\n",
    "\n",
    "drawScatter(df_golden, y, '_golden')\n",
    "\n",
    "##貌似分布非常之均匀，0 1 2 对应的golden 01 分布并没有很大区别，继续看细节数据\n",
    "df_golden_check = pd.concat([y, df_golden], axis=1)\n",
    "golden_0 = df_golden_check[df_golden_check[\"gender\"] == 0]._golden.count()\n",
    "golden_0_1 = df_golden_check[df_golden_check[\"gender\"] == 0]._golden.sum()\n",
    "golden_0_0 = golden_0 - golden_0_1\n",
    "\n",
    "golden_1 = df_golden_check[df_golden_check[\"gender\"] == 1]._golden.count()\n",
    "golden_1_1 = df_golden_check[df_golden_check[\"gender\"] == 1]._golden.sum()\n",
    "golden_1_0 = golden_1 - golden_1_1\n",
    "\n",
    "golden_2 = df_golden_check[df_golden_check[\"gender\"] == 2]._golden.count()\n",
    "golden_2_1 = df_golden_check[df_golden_check[\"gender\"] == 2]._golden.sum()\n",
    "golden_2_0 = golden_2 - golden_2_1\n",
    "\n",
    "print(f'0 - (all:0_rate:1_rate:) => {golden_0} : {golden_0_0/golden_0*100} : {golden_0_1/golden_0*100}')\n",
    "print(f'1 - (all:1_rate:1_rate:) => {golden_1} : {golden_1_0/golden_1*100} : {golden_1_1/golden_1*100}')\n",
    "print(f'2 - (all:2_rate:1_rate:) => {golden_2} : {golden_2_0/golden_2*100} : {golden_2_1/golden_2*100}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlsAAAFOCAYAAACxNtjEAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3XuYXXV97/H3lyHACJaAIJYRCKUcFMgjKHIpaoNVQbElRW1BobUqaCty6mPpCeqpqPRAT6y9HD1VaT1eQCJazEHpMSgQKShIuGjkkqLcQrBcjAMmjBiG7/ljrQk7k32by2/2ZOf9ep79zN7r8lvf9ZtF5sNv/fbekZlIkiSpjG16XYAkSVI/M2xJkiQVZNiSJEkqyLAlSZJUkGFLkiSpIMOWJElSQYYtSV2JiOUR8Y4O27wlIq6YZPvzIiIjYtvJVajpFhELIuKBXtchbekMW9IsERH3RsSrCrU9I0EmMy/KzNeUPMZMi4i3RsS1k9jvqIhYHxHPbrLulog4o37+9oi4MyJ+EREPRcTlzfZp2PfVEXF1vf3PIuLWiPhvEbHDRGuUNDMMW9IWwhGfLUtmfg94AHhD4/KIOBg4ELg4In4b+B/AyZn5bOCFwCWt2oyINwFfBb4E7JOZzwH+EHg+sFeJ85gsr1fpGYYtaRaIiC8CewNfj4h1EfGXDaNRb4+I+4Grmt3WaRwRi4jDI2JFRDxej5J8vN7smvrncN3+UfX2b4uIOyLi5xGxLCL2aWj31fWIy2MR8QkgujiPjaNAzUbTGm9FRsRARHwsIh6NiLuB48e1tW9EXFOP4Hw7Ij4ZEReOa/tPImJ1Xf+7IuKlEfHDiBiua25sr925Zr3/XfX6T0blhcCngKPqfhuut39dRNxe17YmIv6iRZd8Hvijccv+CLg8M38GvBT4XmbeApCZazPz85n5iyZ9G8DHgY9k5gWZubbeZ1Vmvicz76q32yYiFkXET+qRr0siYtdx/fbHEXF/3fcfaDjGYER8ru6D2+v6GmvYMyL+NSIeiYh7IuLMhnXnRMRXI+LCiHgceGuLPpG2Ppnpw4ePWfAA7gVe1fB6HpDAF4AdgUFgAfBAq/2A7wGn1s93Ao4c19a2DfstBH5MNZqyLfBB4Lv1ut2Ax4E3AnOA9wJPAe/ocA5vBa5tc8zlY20A7wLupBqR2RW4unH7+lw+BmwHvKyu58JxbX8K2AF4DfBLYCnwXGAIeBj47U7nWq9P4BvAXKrQ+whw3Phzatj+p8DL6+e7AC9u0R97ARuAvevX21CNdi2sX78cGAE+DBwNbN+mb19Q1zmvw+/gz4HrqUa7tgc+DVw8rt8uoLqeXgQ8CbywXn8+8O/172Mv4EfU11td+03AX9W/k98A7gaOrdefU5/rwnrbwV7/N+XDx2x5OLIlzX7nZOb6zBzpYtsNwG9GxG6ZuS4zr2+z7TuB8zLzjsx8iup21iH1iM/rgNsz86uZuQH4e+A/p3oi4/wB8PeZuTqrUZrzxlZExN5Uoyp/lZm/ysxrgcuatPHRzPxlZl4BrKcKFQ9n5hqq0HBoF+c65vzMHM7M+6mC3yFtat8AHBgRv5aZP8/Mm5ttlJmrge8Ap9SLfocqHF5er/934ETgxfWyn0XExyNioElzu9U/N/4eImJJPYr3RESc2nCuH8jMBzLzSaoQ9MZxt/U+nJkjmfkD4AdUoQuq38lfZzXCthr4x4Z9XgrsnpkfqX8nd1OFtpMatvleZi7NzKe7vF6lrYJhS5r9Vk9g27cD/wW4MyJujIjXt9l2H+Af6j/Ww8BaqluFQ8CejcfNzJxgHd3Y5BjAfePWrc3MJxqWNTv+Qw3PR5q83ql+3u5cxzSGySca9m3mDVSB9L6I+M7YbdkWGm8lngp8qQ6wAGTm/8vM36UaTTqBaiSt2bs+f1b//PWGfU/KzLnAzcBYQNsH+FrDud4BjAJ7dHGu7X4n+wB7jrVbt/3+ce1O9zUi9QXDljR7ZBfL1wPPGntRj4DsvnHDzLsy82SqW2l/A3w1InZs0fZq4J2ZObfhMZiZ36W6TbZxwnU9X2iiE7DX1z+f1bDseQ3PNzkG1e27xnW7RkTjvlOZAN7uXDvZrO8y88bMPIGqn5fSZlI7cCkwFBHHUI1ifaHpQarRoCuBq4CDm2xyJ7CmbqOd1cBrx53rDvVoXyftfiergXvGtfvszHxd42l0cQxpq2PYkmaPh6jmwbTzH8AOEXF8RMyhmnu0/djKiDglInbPzKeB4XrxKNUcpKfHtf8p4OyIOKjed+eo3u0G1S2tgyLixPr205lsGpQ6ysxHqMLBKfVk+LcB+zVscglwZkQ8PyJ2ARY17HsfsAI4JyK2q0eOfncixx+n3bl28hDw/IjYrt53u6g+T2zneoTqcao+bioz11O9g/D/APdl5oqxdRFxQkScFBG71BPyDwd+m2rO1fh2Engf8KGIOK1hn/3ZdHTpU8Bfj90ijYjdI+KELs/1Eqp+2iUing+8p2Hd94HHo/qYicH6d3pwRLy0eVOSxhi2pNnjPOCD9S2apu9uy8zHgD8D/pkqyKynmnA95jjgtohYB/wDcFI9p+kJ4K+B6+r2j8zMr1GNfi2p3z32I+C19XEeBd5ENWH6Z8D+wHWTOKfTgLPqNg4CGkeSLgCWUc0ZuplqBKjRW4Cj6n3PBb5MNZl7wtqdaxeuAm4D/jMiHq2XnQrcW7f1Lp6Zk9XK56luw40f1fo5VR/dRf0GAGBxZl7U4jy+TDWv6hSqkaZHqQLSZ4Cv1Jv9A9X8tisi4hdUwe2Irs60mqh/H3APcAXwxYZjj1IF3kPq9Y9SXYc7d9m2tNWK6n+WJGnq6tGrUzLzlQXa/jJwZ2Z+aLrblqSSHNmSNJ0Oohr1mLKoPjNrv/pzo46jmjy+dDralqSZ5Cf8SjOo/kiD21usPrD+2IFZLSI+RfPbZtdSTa7udi5UJ8+jurX4HKpbpX+a9Yd/StKWxNuIkiRJBXkbUZIkqSDDliRJUkGzas7WbrvtlvPmzZuWttavX8+OO+44LW2pe/Z7b9jvM88+7w37febZ563ddNNNj2bm7p22m1Vha968eaxYsaLzhl1Yvnw5CxYsmJa21D37vTfs95lnn/eG/T7z7PPWIuK+zlt5G1GSJKkow5YkSVJBhi1JkqSCDFuSJEkFGbYkSZIKMmxJkiQVZNiSJEkqyLAlSZJUkGFLkiSpoFn1CfIlzVt0+WbL7j3/+B5UIkmaLZbesobFy1bx4PAIe84d5KxjD2DhoUO9LgvYvLZjXrA7V9/5SMvXE6n9LRd8j+t+snaTZUMd2phKX7Xb94NLV3LxDasZzWQggpOP2ItzF86fUJs7D84hAoaf2DDrfo+wlYStZkFrbLmBS5K2TktvWcPZl65kZMMoAGuGRzj70pUAPf9D3ay2C6+/f+P6Zq+7rb1Z0OrUxlT6qt2+K+5bu8l5jGZufN0ucI1vc3hkQ1fn0SveRpQkbZUWL1u18Y/1mJENoyxetqpHFT2jWW2ddFt7s6DVqY2p9FW7fS++YXXTfVotb9fmZGqbKVvFyJYkSeM9ODwyoeUzabI1TEftzdqYSl+12zdb7DOardZM/bi94MiWJGmrtOfcwQktn0mTrWE6am/WxlT6qt2+AxFN17VaPh3H7QXDliRpq3TWsQcwOGdgk2WDcwY469gDelTRM5rV1km3tR+9364TbmMqfdVu35OP2KvpPq2Wt2tzMrXNlK0ibLWaBO/keEnaei08dIjzTpzP0NxBgurdeOedOH9WTKpuVtspR+7d9nW3tV902lFNA1e7NqbSV+32PXfhfE45cu+NI1kDEZxy5N4d3404vs25g3PY5VlzZt3vccxWM2fLYCVJGm/hoUOz6o9yo5K1XXTaURPeZyr1tNv33IXzu/qoh+msZ6ZtFSNbkiRJvWLYkiRJKsiwJUmSVJBhS5IkqSDDliRJUkGGLUmSpIIMW5IkSQUZtiRJkgoybEmSJBVk2JIkSSrIsCVJklSQYUuSJKkgw5YkSVJBhi1JkqSCDFuSJEkFGbYkSZIKMmxJkiQVZNiSJEkqyLAlSZJUkGFLkiSpIMOWJElSQYYtSZKkggxbkiRJBRm2JEmSCjJsSZIkFWTYkiRJKsiwJUmSVJBhS5IkqSDDliRJUkGGLUmSpIIMW5IkSQUZtiRJkgoybEmSJBVk2JIkSSrIsCVJklSQYUuSJKkgw5YkSVJBhi1JkqSCDFuSJEkFGbYkSZIKMmxJkiQVZNiSJEkqyLAlSZJUkGFLkiSpIMOWJElSQYYtSZKkggxbkiRJBRm2JEmSCjJsSZIkFWTYkiRJKsiwJUmSVJBhS5IkqSDDliRJUkGGLUmSpIIMW5IkSQUZtiRJkgoybEmSJBVk2JIkSSrIsCVJklSQYUuSJKkgw5YkSVJBhi1JkqSCDFuSJEkFGbYkSZIKMmxJkiQVZNiSJEkqyLAlSZJUkGFLkiSpIMOWJElSQYYtSZKkggxbkiRJBRm2JEmSCjJsSZIkFWTYkiRJKsiwJUmSVJBhS5IkqSDDliRJUkGGLUmSpIIMW5IkSQUZtiRJkgoybEmSJBVk2JIkSSrIsCVJklSQYUuSJKkgw5YkSVJBhi1JkqSCDFuSJEkFGbYkSZIKMmxJkiQVZNiSJEkqyLAlSZJUkGFLkiSpIMOWJElSQYYtSZKkggxbkiRJBRm2JEmSCjJsSZIkFWTYkiRJKsiwJUmSVJBhS5IkqSDDliRJUkEdw1ZEDETEt2eiGEmSpH7TMWxl5ijwRETsPAP1SJIk9ZVtu9zul8DKiPgWsH5sYWaeWaQqSZKkPtFt2Lq8fkiSJGkCugpbmfn5iBgE9s7MVYVrkiRJ6htdvRsxIn4XuBX4Zv36kIi4rGRhkiRJ/aDbj344BzgcGAbIzFuBfQvVJEmS1De6DVtPZeZj45bldBcjSZLUb7qdIP+jiHgzMBAR+wNnAt8tV5YkSVJ/6HZk6z3AQcCTwMXA48CflypKkiSpX3T7bsQngA/UD0mSJHWpbdiKiK/TZm5WZv7etFckSZLURzqNbH2s/nki8Dzgwvr1ycC97XaMiM8CrwcezsyDp1CjJEnSFqtt2MrM7wBExEcz8xUNq74eEdd0aPtzwCeAL0ypwmkyb9HmH4B/7/nH96ASSZKaW3rLGhYvW8WDwyPsOXeQs449gIWHDnVc16qNZ203wBO/GiWBgQhOPmIvzl04f7O25j1nkOvv/jmjuenNrLmDczjn8Gja/prhEQYiGM3c7OfcwTlEwM+f2NDyXIfqcwC6Oq8tWbfvRtw9In4jM+8GiIh9gd3b7ZCZ10TEvKmVNz2aBa2x5QYuSdJssPSWNZx96UpGNowCsGZ4hLMvXblxfat1jcFkfBvrfzW6cd1oJhdefz/3PLKOm+9/bJO21gyPNK1peGQDD6wdZekta1h46NBm7Y+Fs/E/h0dah6wxa4ZHOOsrP4CADaPZ9ry2dN2+G/G9wPKIWB4Ry4Gr8d2IkiRNm8XLVm0MMWNGNoyyeNmqtus6tTHedT9Z23GbRkluPE437U/EhqdzY9Aa0+y8tnSR2d1nk0bE9sAL6pd3ZuaTXewzD/hGuzlbEXE6cDrAHnvs8ZIlS5Z0VU8n69atY6eddgJg5Zrxn8f6jPlDO0/L8VRp7HfNHPt95tnnvdHP/d7ub1U7jX/HJttGO3sMwkMj1XFKtN/KlvD3+ZhjjrkpMw/rtN1EwtZvAfNouPWYmW3nY3UTthoddthhuWLFiq7q6WT58uUsWLAAaH0bEZy3Nd0a+10zx36fefZ5b/Rzvx99/lVNb+cNzR0EaLnuukWv7NjGVLxv/lMsWf1srlv0yiLtNzP+vGariOgqbHX7RdRfpHpn4suAl9aPjo1LkqTunHXsAQzOGdhk2eCcAc469oC26zq1Md7R++3acZtGQWw8TjftT8ScbYI5A5tOwG92Xlu6bifIHwYcmN0OgwERcTGwANgtIh4APpSZ/zLxEqfu3vOP992IkqRZbWxCeLt35nV61974Nqbj3YjP33W7je02tu+7EbvX9XcjUn3O1k+7bTgzT55URYUYrCRJs93CQ4daBo126ya6XbdtQXXrdrL7dqvfwtV43Yat3YDbI+L7VN+PCPgJ8pIkSZ10G7bOKVmEJElSv+r2i6i/ExH7APtn5rcj4lnA9M2QkyRJ6lPdvhvxNOCrwKfrRUPA0lJFSZIk9YtuP0H+3cDRwOMAmXkX8NxSRUmSJPWLbsPWk5n5q7EXEbEt0PXHQEiSJG2tug1b34mI9wODEfFq4CvA18uVJUmS1B+6DVuLgEeAlVTfY3h5Zn6gWFWSJEl9om3YiogTIuLdmfl0Zl4A7EP1afLvj4g3zkiFkiRJW7BOI1t/CVzW8Ho74CVUX8Pzp4VqkiRJ6hudPmdru8xc3fD62sxcC6yNiB0L1iVJktQXOo1s7dL4IjPPaHi5+/SXI0mS1F86ha0b6g803UREvBP4fpmSJEmS+ken24jvBZZGxJuBm+tlLwG2BxaWLEySJKkftA1bmfkw8FsR8UrgoHrx5Zl5VfHKJEmS+kC3X0R9FWDAkiRJmqBuP9RUkiRJk2DYkiRJKsiwJUmSVJBhS5IkqSDDliRJUkGGLUmSpIIMW5IkSQUZtiRJkgoybEmSJBVk2JIkSSrIsCVJklSQYUuSJKkgw5YkSVJBhi1JkqSCDFuSJEkFGbYkSZIKMmxJkiQVZNiSJEkqyLAlSZJUkGFLkiSpIMOWJElSQYYtSZKkggxbkiRJBRm2JEmSCjJsSZIkFWTYkiRJKsiwJUmSVJBhS5IkqSDDliRJUkGGLUmSpIIMW5IkSQUZtiRJkgoybEmSJBVk2JIkSSrIsCVJklSQYUuSJKkgw5YkSVJBhi1JkqSCDFuSJEkFGbYkSZIKMmxJkiQVZNiSJEkqyLAlSZJUkGFLkiSpIMOWJElSQYYtSZKkggxbkiRJBRm2JEmSCjJsSZIkFWTYkiRJKsiwJUmSVJBhS5IkqSDDliRJUkGGLUmSpIIMW5IkSQUZtiRJkgoybEmSJBVk2JIkSSrIsCVJklSQYUuSJKkgw5YkSVJBhi1JkqSCDFuSJEkFGbYkSZIKMmxJkiQVZNiSJEkqyLAlSZJUkGFLkiSpIMOWJElSQYYtSZKkggxbkiRJBRm2JEmSCjJsSZIkFWTYkiRJKsiwJUmSVJBhS5IkqSDDliRJUkGGLUmSpIIMW5IkSQUZtiRJkgoybEmSJBVk2JIkSSrIsCVJklSQYUuSJKkgw5YkSVJBhi1JkqSCDFuSJEkFGbYkSZIKMmxJkiQVZNiSJEkqyLAlSZJUkGFLkiSpIMOWJElSQYYtSZKkggxbkiRJBRm2JEmSCjJsSZIkFVQ0bEXEcRGxKiJ+HBGLSh5LkiRpNtq2VMMRMQB8Eng18ABwY0Rclpm3lzpmO/MWXb7ZsnvPP74HlUiStlYfXLqSi29YzWjmxmVDcwc569gDWHjo0JTaXnrLGhYvW8WDwyPsWbcJbLas03HecsH3uO4naze+ft/8p/jA+VdNqsbG8x2I4OQj9uLchfM3qXXnwTlEwPATG7qqsdl5jm3fbl0vFQtbwOHAjzPzboCIWAKcAMx42GoWtMaWG7gkSTPhg0tXcuH192+2fM3wCGdfuhJg0sFg6S1rOPvSlYxsGN3Y5llf/QEkbHg6uz7O+KA1lRrHn+9oJhdefz/3PLKOm+9/bGOtwyMbuj5Os/Mc2x5oua7XgavkbcQhYHXD6wfqZZIkbXUuvmF1y3UjG0ZZvGzVpNtevGzVxpAxZsNobgxa3R6nWdCabI2tzve6n6zdrNZuj9PsPMe2b7eu1yIzO281mYYj3gQcm5nvqF+fChyeme8Zt93pwOkAe+yxx0uWLFkyLcdft24dO+20EwAr1zzWcrv5QztPy/FUaex3zRz7febZ572xJfd7u79FYyb7N6mbtrs5TrN29hiEh0Y67zvVmsZrdpzJtlnqb/0xxxxzU2Ye1mm7kmHrKOCczDy2fn02QGae12qfww47LFesWDEtx1++fDkLFiwAWt9GBOdtTbfGftfMsd9nnn3eG1tyv+939r9tMldrvKG5g1y36JWTavvo869izfBI5w07HKfZ38v3zX+Kv1257YRr7HS+k6mx1XkOzR0EaLlusv3aSUR0FbZK3ka8Edg/IvaNiO2Ak4DLCh5PkqRZ6+Qj9mq5bnDOwMYJ7ZNx1rEHMDhnYJNlcwaCOdvEhI5z9H67TluNrc736P123azWbo/T7DzHtm+3rteKha3MfAo4A1gG3AFckpm3lTpeO61GrxzVkiTNlHMXzueUI/dmIDYNQENzBznvxPlTmsS98NAhzjtxPkNzB4m6zcVvfBGL3/SiTZZ1Os5Fpx3VNHBNpsbx5zsQwSlH7s1Fpx21Sa1zB+ewy7PmdFVjs/Mc277dul4rdhtxMkrdRtTMsd97w36fefZ5b9jvM88+b2023EaUJEna6hm2JEmSCjJsSZIkFWTYkiRJKsiwJUmSVJBhS5IkqSDDliRJUkGGLUmSpIIMW5IkSQXNqk+Qj4hHgPumqbndgEenqS11z37vDft95tnnvWG/zzz7vLV9MnP3ThvNqrA1nSJiRTcfoa/pZb/3hv0+8+zz3rDfZ559PnXeRpQkSSrIsCVJklRQP4etz/S6gK2U/d4b9vvMs897w36fefb5FPXtnC1JkqTZoJ9HtiRJknquL8NWRBwXEasi4scRsajX9fSriLg3IlZGxK0RsaJetmtEfCsi7qp/7tLrOrd0EfHZiHg4In7UsKxpP0flH+tr/4cR8eLeVb5la9Hv50TEmvqavzUiXtew7uy631dFxLG9qXrLFhF7RcTVEXFHRNwWEf+1Xu71XlCbfvd6nyZ9F7YiYgD4JPBa4EDg5Ig4sLdV9bVjMvOQhrcFLwKuzMz9gSvr15qazwHHjVvWqp9fC+xfP04H/mmGauxHn2Pzfgf4u/qaPyQz/w2g/jfmJOCgep//Xf9bpIl5CnhfZr4QOBJ4d923Xu9ltep38HqfFn0XtoDDgR9n5t2Z+StgCXBCj2vampwAfL5+/nlgYQ9r6QuZeQ2wdtziVv18AvCFrFwPzI2IX5+ZSvtLi35v5QRgSWY+mZn3AD+m+rdIE5CZP83Mm+vnvwDuAIbwei+qTb+34vU+Qf0YtoaA1Q2vH6D9RaPJS+CKiLgpIk6vl+2RmT+F6j9g4Lk9q66/tepnr//yzqhvWX224Ta5/T7NImIecChwA17vM2Zcv4PX+7Tox7AVTZb5lssyjs7MF1MN5b87Il7R64Lk9V/YPwH7AYcAPwX+tl5uv0+jiNgJ+FfgzzPz8XabNllmv09Sk373ep8m/Ri2HgD2anj9fODBHtXS1zLzwfrnw8DXqIaRHxobxq9/Pty7Cvtaq372+i8oMx/KzNHMfBq4gGdundjv0yQi5lD9wb8oMy+tF3u9F9as373ep08/hq0bgf0jYt+I2I5qEt9lPa6p70TEjhHx7LHnwGuAH1H19R/Xm/0x8H97U2Hfa9XPlwF/VL9L60jgsbHbL5q6cfOBfp/qmoeq30+KiO0jYl+qCdvfn+n6tnQREcC/AHdk5scbVnm9F9Sq373ep8+2vS5gumXmUxFxBrAMGAA+m5m39bisfrQH8LXqv1G2Bb6Umd+MiBuBSyLi7cD9wJt6WGNfiIiLgQXAbhHxAPAh4Hya9/O/Aa+jmrD6BPAnM15wn2jR7wsi4hCqWyb3Au8EyMzbIuIS4Haqd3a9OzNHe1H3Fu5o4FRgZUTcWi97P17vpbXq95O93qeHnyAvSZJUUD/eRpQkSZo1DFuSJEkFGbYkSZIKMmxJkiQVZNiSJEkqyLAlSZJUkGFL0oRExNyI+LNpbG9BRPzWJPa7NyJ2a7P+u5Oo4xsTrWO6RMT7e3VsSWUZtiRN1FygadiKiIFJtLcAmHDY6iQzp73NwgxbUp8ybEmaqPOB/SLi1ohYXI8IXR0RX6L6BOp5ETH2tR5ExF9ExDn18zMj4vaI+GFELImIecC7gPfW7b08InaPiH+NiBvrx9H1vs+JiCsi4paI+DTNvwx3o4hYV//cZMQqIj4REW+tnx8XEXdGxLXAiQ3b7B4R34qImyPi0xFxX0TsVp/bnRHxzxHxo4i4KCJeFRHXRcRdEXF4vf+OEfHZuv5bIuKEevlbI+LSiPhmvf3/rJefDwzWfXBRvf/lEfGD+jh/OKXfmKSe6ruv65FU3CLg4Mw8BKowQ/UFtQdn5j11gGq3776Z+WREzM3M4Yj4FLAuMz9Wt/cl4O8y89qI2Jvqq7deSPV1Oddm5kci4njg9KmcRETsQPXluq+k+rqXLzes/hBwVWaeFxHHjTvWb1J9XczpVN/F+mbgZcDvUY1OLQQ+UO//toiYC3w/Ir5d738IcCjwJLAqIv5XZi6KiDMa+vQNwIOZeXz9euepnKuk3nJkS9J0+H5m3tPFdj8ELoqIU6i+U62ZVwGfqL+j7TLg16L60vNXABcCZOblwM+nWPMLgHsy866svrfswoZ1LwOW1Mf65rhj3ZOZKzPzaeA24Mp6/5XAvHqb1wCL6nNYDuwA7F2vuzIzH8vMX1J9t9w+TWpbCbwqIv4mIl6emY9N8Vwl9ZAjW5Kmw/qG50+x6f/I7dDw/Hiq0PR7wH+PiIOatLUNcFRmjjQurL/0fDJf5tqunlbttbtF+WTD86cbXj/NM/+mBvCGzFy1SaMRR4zbf5Qm/w5n5n9ExEuovmT5vIi4IjM/0qYmSbOYI1uSJuoXwLPbrH8IeG49x2p74PUAEbENsFdmXg38JdVE+52atHcFcMbYi4g4pH56DfCWetlrgV26rPc+4MCI2L6+Hfc79fI7gX0jYr/69ckN+1wL/EF9rNdM4FhjlgHviTohRsShXeyzISLm1NvvCTyRmRcCHwNePMHjS5pFDFuSJiQzfwZcV0/cXtxk/QbgI8ANwDeoQg3AAHBhRKwEbqGalzUMfB34/bEJ8sCZwGH1JPrbqSbQA3wYeEVE3Ex1m+7+TqXW9awGLqG+hVkfm/o23unA5fUE+fsa9v0w8Jr6WK8FfkoVCrv1UWAO8MP6zQIf7WKfz9TbXwTMp5rndStbbe1iAAAAi0lEQVTV/K9zJ3BsSbNMVFMNJKl/RMRzgJszs9l8qG723x4YzcynIuIo4J/GJq9L0kQ5Z0tSX6lvwS2nuv02WXsDl9S3Pn8FnDYNpUnaSjmyJamleoToyiarfqe+ndhTs70+SQLDliRJUlFOkJckSSrIsCVJklSQYUuSJKkgw5YkSVJBhi1JkqSC/j9koT1wyG31pgAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 720x360 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#从上面golden的分布来看，male=1的 这个golden金牌占比还是大一丢丢的，稍微有点差异，具体要不要加入到特征里，在看\n",
    "\n",
    "# _trusted_judgments 特征,  always 3 for non-golden, 3是non_golden的默认值\n",
    "df_trusted_judgments = df['_trusted_judgments']\n",
    "df_trusted_judgments.describe()\n",
    "drawScatter(df_trusted_judgments, y, '_trusted_judgments')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAl0AAAFOCAYAAAC8KKiDAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3XuYXFWd7vH3TdNgB5QOF1HaQACZOGIEJAqK4wTOaPBK5IAHHC+M53hncBSjRD2ScVBwMl6Ol0cdHlEcNEEQMyijGRRaZ1CiYUIIDEYuAqFBQEPAQAOdzu/8sXd3qqvrsqu7anV19ffzPP2katfaa639q13db/beVeWIEAAAAFpr1lRPAAAAYCYgdAEAACRA6AIAAEiA0AUAAJAAoQsAACABQhcAAEAChC4AAIAECF1Ah7E93/Z623+yfeZUz2eybIftZ0/1PNoV9QGmD0IX0Hk+JKk/Ip4aEV+Y6sm0K9tPsb3V9vEVHvuc7cvy2y+1/QvbD9veYvta2y+s0e+htlfZftD2I7Zvtf1F289q5fYAaH+ELqDzHCjp5qmeRLuxvUvp/Yh4XNIlkt5S1q5L0mmSLrL9NEk/lPRFSXtJ6pP095KeqDLGsyWtlXSvpCMj4mmSjpV0u6SXNnN7Jqu8HgBaj9AFdBDbV0s6TtKXbG+z/We2X52fbnzE9mbby0va/9j2GWV9bLB9Up1xwva78qM4D9n+sm3njy23fXFJ23l5+13y+/22z82PHm2z/QPbe9v+dj7HX9ueVzbkq2zfYfsPtlfYnlXS/9ts35LPY43tA8vm+V7bt0q6tcKmXCTpf9qeXbJssbLfjT+S9GeSFBErI2I4IgYj4t8j4sYqpVku6dqI+EBE3JOv+0BEfD4iVpXM6zW2b8iPtP3C9vNLHrvT9gdt35gfXbvE9lNKHl9q+z7b99p+W9nzspvtf7J9t+37bX/Vdk/+2CLb99j+sO3fS/pGlW0A0CKELqCDRMTxkv5D0hkRsUdE/FbSo8qO5vRKerWkd9tekq/yHWVHdSRJtp+r7EjZlQWGe42kF0o6XNIblIWVok6V9GZlR44OkfRLZSFgL0m3SDqnrP3rJS2U9AJJJ0p6Wz7fJZI+IukkSfsq2/aVZesukXS0pOeWTyIifiHpvnz9EW+W9J2I2C7pt5KGbV9k+5W259TZrr+S9L1aDWy/QNKFkt4paW9JX5N0he3dSpq9QdIJkg6S9HxJp+frniDpg5JeLunQfLxSn1YWFI+Q9Gxl9f14yePPUFbjAyW9o862AGgyQhfQ4SKiPyI2RsSO/AjNSkl/mT/8fUlHlBwd+mtJl0dExdNnZc6PiK0Rcbeka5T9oS/qGxFxe0Q8rOyI0u0R8ZM86Fwq6ciy9p+OiC35WJ/XzqD4TknnRcQt+bqfKtse5Y9viYjBKnP5lvJTjPnpxBOVHQFTRDyi7LRgSLpA0oO2r7C9X5W+9pH0+5E7ts/Ij2Zts31Bvvjtkr4WEWvzo2cXKTtdeUxJP1+IiHsjYoukH2hnbd+Q1+6miHhU2ZG1kbGc9/3+fHv/lNfj1JJ+d0g6JyKeqFEPAC1C6AI6nO2jbV+TX9j9sKR3KQsHyv8wX6mdf5hPlfTtgl3/vuT2Y5L2aGBa95fcHqxwv7yvzSW375K0f377QEn/Lw82WyVtkWRlR3gqrVvJtyQdZ7tP0smSbouI9SMP5oHu9Ih4lqTn5WN/vkpff5T0zJJ1vxQRvXn77pI5nzUy53zec0u2Sape2/01vhYj9pU0W9L1Jf3+OF8+4sH8WjYAU4DQBXS+70i6QtLciNhT0leVBZMRKyWdZvvFknqUHbWajEeV/fEf8YxJ9idloWTEAcouVJeyAPLOiOgt+enJTxuOiFod50fP/kPZUb43Kwth1dr+RtI3lYWvSn6qsacqK9ks6ZNlc54dEeWnRSu5T+NrMeIPygLrYSX97hkRpQG2Zi0AtBahC+h8T5W0JSIet/0iSW8se/zflB19+YSkSyJixyTHu0HSy2wfYHtPScsm2Z8kLbU9x/ZcSe9T9q5DKQuQy2wfJkm297R9ygT6v0jSGcreaTh6pM/2c2yfNfJxD/n4p0m6rko/yyX9he3P5kfOZHsfSX9e0uYCSe/Kj0Da9u75mx2eWmCe35V0uu3n5hf/j177lj9vF0j6nO2n52P32W7kWjsALUToAjrfeyR9wvaflF1U/d3SB/Prty5XdlH2dyY7WERcpSwU3SjpemUfuTBZ/5r3dYOy06Ffz8f6vrKLx1fZfkTSTZJeOYH+L5M0R9JPI+K+kuV/UnYR/lrbjyoLWzdJOqtSJ/kbF46R9CxJG/KaX6vsyNz/zdusU3bt1ZckPSTpNuUXytcTET9Sdqry6ny9q8uafDhffl1ej59Iml+kbwCt5wiONgMAALQaR7oAAAAS4BOJAYxj+y+UfZTDOGUXZgMACuL0IgAAQAKcXgQAAEigrU4v7rPPPjFv3ryWjvHoo49q9913b+kYnYA6FUOdiqNWxVCnYqhTMdSpuInU6vrrr/9DROxbv2WmrULXvHnztG7dupaO0d/fr0WLFrV0jE5AnYqhTsVRq2KoUzHUqRjqVNxEamX7rvqtduL0IgAAQAKELgAAgAQIXQAAAAkQugAAABIgdAEAACRA6AIAAEiA0AUAAJAAoQsAACABQhcAAEACbfWJ9K007+wrJUlnLdiu0/PbkObM7tYTQ8N6bGjHmOVLnz+st3/kSpUtrmmWpCLNZ1nabZdZenxoh/bv7dHSxfMlSSvWbNLA1kF12RqO0JzZ3Xp8aFiD+STmzO7WOa89bEzbEV22jjl4ju7846Du3To42u+SI/u0ev3AuL7L5/PGow/QuUsWVJxvpfX78v57K7QrH38imtlXtX57Z3crQnp4cKhlYzTS78dWb9TKtZs1HKEuW6cdPbfqc9LOptN2rF4/oOVX3Kytg0OSdr7GmrEfNNu4/erw4ameEtCwGRG65hGyqnrosaGKy3dENBS4pGKBK+tbo0FqYOugll66QbI0NJyFoZFQVD63hx4b0lmXbtAsSUM7xgan4Qhde/uW0fsDWwe17PKNWnfXFn3v+gENDg2P6bt8Phdfd7ckjfvjuHr9gJZdvnHc+iP9n/eSrortRh6X1PAfsGb2Vavf0vq2aozSfntrrPex1RtHnwMpq3O156SdTaftWL1+QEsv3TDmtfTQY0NaetkGSZPbD5qt0n418NCwVq8faKt5AvVwehFTbmhHjAaueoZ3xLjAVc3g0LBWrt08+ou6npVrN49btmLNpqrrDw4N6/6HH6/abnBoWCvWbCo0dr0xJ9pXvX5TjFGk30q1r7W8XU2n7VixZlPF19LQcEx6P2i2SvvVjmi/eQL1ELrQ0Sod2Wqk7b0lpzAreXJ4R8129dZvZJ2J9NXo+q0ao16/1Z6nRp6/djCdtqPWczLZ/aDZWvWaAFIjdKGjddmTart/b0/NdXbtmlWzXb31G1lnIn01un6rxqjXb7XnqZHnrx1Mp+2o9ZxMdj9otla9JoDUCF2Yct2zrO6uYn+UumZZ3bOKte3p7tJpR89VT3dXofanHT133LKli+dXXb+nu0v77fmUqu16urtG3yTQiGb2Va/fFGMU6bdS7Wstb1fTaTuWLp5f8bXU3eVJ7wfNVmm/muX2mydQz4wIXXee/+qpnkLbmjO7W7O7x+8Gs2xVWFxT0eazLPV0z5Il9fX2aMUph2vFyYerL/9f68hRgTmzu9VTMok5s7v1mVMO14pTdrYd0WXr2EP2Ul9vz2i/5520QOcuWaDzTlowru/y+bzpmMrvXlxyZF/F9Uf67+3pHteudPyJXOTbzL5q9Ttndrd6e7pbOkbRfs9dskBvOuaA0fp22VWfk3Y2nbZjyZF9WnHK4aP7sJTtEytOPrztLk6vtF/1zelpu3kC9Tja6FqDhQsXxrp161o6Rn9/vxYtWtTSMToBdSqGOhVHrYqhTsVQp2KoU3ETqZXt6yNiYdH2M+JIFwAAwFQjdAEAACRA6AIAAEiA0AUAAJAAoQsAACABQhcAAEAChC4AAIAECF0AAAAJELoAAAASIHQBAAAkQOgCAABIgNAFAACQAKELAAAgAUIXAABAAoQuAACABAhdAAAACRC6AAAAEiB0AQAAJEDoAgAASIDQBQAAkAChCwAAIAFCFwAAQAKELgAAgAQIXQAAAAkQugAAABIgdAEAACRA6AIAAEiA0AUAAJAAoQsAACABQhcAAEAChC4AAIAECF0AAAAJELoAAAASIHQBAAAkQOgCAABIgNAFAACQAKELAAAgAUIXAABAAoQuAACABAhdAAAACRC6AAAAEiB0AQAAJEDoAgAASIDQBQAAkAChCwAAIAFCFwAAQAKELgAAgAQIXQAAAAkQugAAABIgdAEAACRA6AIAAEiA0AUAAJAAoQsAACABQhcAAEAChC4AAIAECF0AAAAJELoAAAASIHQBAAAkQOgCAABIgNAFAACQAKELAAAgAUIXAABAAoQuAACABAhdAAAACRC6AAAAEiB0AQAAJEDoAgAASIDQBQAAkAChCwAAIAFCFwAAQAKELgAAgAQIXQAAAAkQugAAABIgdAEAACRA6AIAAEiA0AUAAJAAoQsAACABQhcAAEAChC4AAIAECF0AAAAJELoAAAASIHQBAAAkQOgCAABIgNAFAACQAKELAAAgAUIXAABAAoQuAACABAhdAAAACRC6AAAAEiB0AQAAJEDoAgAASIDQBQAAkAChCwAAIAFCFwAAQAKELgAAgAQIXQAAAAkQugAAABIgdAEAACRA6AIAAEiA0AUAAJAAoQsAACCBuqHLdpftn6SYDAAAQKeqG7oiYljSY7b3TDAfAACAjrRLwXaPS9po+ypJj44sjIgzWzIrAACADlM0dF2Z/wAAAGACCoWuiLjIdo+kAyJiU4vnBAAA0HEKvXvR9msl3SDpx/n9I2xf0cqJAQAAdJKiHxmxXNKLJG2VpIi4QdJBLZoTAABAxykaurZHxMNly6LZkwEAAOhURS+kv8n2GyV12T5U0pmSftG6aQEAAHSWoke6/lbSYZKekLRS0iOS/q5VkwIAAOg0Rd+9+Jikj+Y/AAAAaFDN0GX7B6px7VZEvK7pMwIAAOhA9Y50/VP+70mSniHp4vz+aZLurLWi7QslvUbSAxHxvEnMEQAAYNqrGboi4meSZPsfIuJlJQ/9wPbP6/T9TUlfkvStSc2wSeadnX2g/lkLtuv0s/lw/Xqmsk67zLK27xh/gLWvt0dLF8/Xuru2aOXazRqOUJetg/edrdsffFTlq1jSXx9zgBYeuJdWrNmkga2D6rI1HLXfeNvb063lrztMS47sq9rmY6s36tvX3a0PlNVpzuxunfPaneuuXj+gFWs26d6tg9q/t0fHPWdfXfObB0fvL108v+Y4lfpoZJ3Sbe4rW/djqzeO1nFE3wTn2Igic2vmOK3ajnYZE8Xw3KAdFH334r62D46IOyTJ9kGS9q21QkT83Pa8yU2vOeYRsqaVSoFLkga2DuoDl9ygHSXLhiN06wOPVmwfki6+7m59Z+3do4GsXuCSpK2DQ1p66QZJqvhL+WOrN+ri6+6uuO5Djw1p6WUbRu8vu3yjBoeGR+dfut7A1kEtu3xj1XGk7A9FeR+NrjOyzaXrrrtrS8VtmMgcG1Fkbq0Yp9n9t8uYKIbnBu2i6LsX3y+p33a/7X5J14h3L2IK7KjfZPw6E/hEuaEdoRVrKn/j1cq1m2uvO5ytu2LNptFf8tUMDg1XHUdSxT4msk75uvW2oZHxGlFkbq0ap5n9t8uYKIbnBu3CUeB//pJkezdJz8nv/iYiniiwzjxJP6x1TZftd0h6hyTtt99+R61atarQfBqxcWDn57ru1yPdP9j0IToOdcos6Ntz3LJW7E+VxikfqxnrTEa18Yratm2bfvdw7RDajHGkidWtXcbctm2b9thjj2ZMqaM1Uqep2B/aBftTcROp1XHHHXd9RCws2r6R0PUSSfNUckoyImper1UkdJVauHBhrFu3rtB8GlF6evGsBdv1mY1Fz6rOXNQpu77p2rOPH7f8kGX/NnparFqd+np7JGWnMSY6jiQde/7VFfuYyDql6/7+4ccLnWotMl5R/f39+uh1O+rObbLjSBOrW7uM2d/fr0WLFjVxZp2pkTpNxf7QLtifiptIrWw3FLqKfuH1vyh7J+NLJb0w/yk8CNAsRc+Hj1nHja/TPctaunh+xcdOO3pu7XW7snWXLp6vnu6umm17uruqjiOpYh8TWad83Xrb0Mh4jSgyt1aN08z+22VMFMNzg3ZR9FDGQknPjaKHxSTZXilpkaR9bN8j6ZyI+HrjU5y8O89/NRfTTyPt/u7Fc5cskCR9u8KF6OXvXpQ0qXcvjjzWyLuuStep9g7BkTap371YZG7NHifVu9WmYkwUw3ODdlHo9KLtSyWdGRH3tXIyrTq9WIpDrcVQp2KoU3HUqhjqVAx1KoY6FZfi9GLRI137SPpv279S9v2LkvhEegAAgKKKhq7lrZwEAABApyv6hdc/s32gpEMj4ie2Z0uqfYUwAAAARhV99+LbJV0m6Wv5oj5Jq1s1KQAAgE5T9B3475V0rKRHJCkibpX09FZNCgAAoNMUDV1PRMSTI3ds76Lsq+0AAABQQNHQ9TPbH5HUY/vlki6V9IPWTQsAAKCzFA1dZ0t6UNJGZd+TeGVEfLRlswIAAOgwNUOX7RNtvzcidkTEBZIOVPbp9B+xfXKSGQIAAHSAeke6PiTpipL7u0o6StnX+7y7RXMCAADoOPU+p2vXiNhccv8/I2KLpC22d2/hvAAAADpKvSNdc0rvRMQZJXf3bf50AAAAOlO90LU2/2DUMWy/U9KvWjMlAACAzlPv9OL7Ja22/UZJ/5UvO0rSbpKWtHJiAAAAnaRm6IqIByS9xPbxkg7LF18ZEVe3fGYAAAAdpOgXXl8tiaAFAAAwQUU/HBUAAACTQOgCAABIgNAFAACQAKELAAAgAUIXAABAAoQuAACABAhdAAAACRC6AAAAEiB0AQAAJEDoAgAASIDQBQAAkAChCwAAIAFCFwAAQAKELgAAgAQIXQAAAAkQugAAABIgdAEAACRA6AIAAEiA0AUAAJAAoQsAACABQhcAAEAChC4AAIAECF0AAAAJELoAAAASIHQBAAAkQOgCAABIgNAFAACQAKELAAAgAUIXAABAAoQuAACABAhdAAAACRC6AAAAEiB0AQAAJEDoAgAASIDQBQAAkAChCwAAIAFCFwAAQAKELgAAgAQIXQAAAAkQugAAABIgdAEAACRA6AIAAEiA0AUAAJAAoQsAACABQhcAAEAChC4AAIAECF0AAAAJELoAAAASIHQBAAAkQOgCAABIgNAFAACQAKELAAAgAUIXAABAAoQuAACABAhdAAAACRC6AAAAEiB0AQAAJEDoAgAASIDQBQAAkAChCwAAIAFCFwAAQAKELgAAgAQIXQAAAAkQugAAABIgdAEAACRA6AIAAEiA0AUAAJAAoQsAACABQhcAAEAChC4AAIAECF0AAAAJELoAAAASIHQBAAAkQOgCAABIgNAFAACQAKELAAAgAUIXAABAAoQuAACABAhdAAAACRC6AAAAEiB0AQAAJEDoAgAASIDQBQAAkAChCwAAIAFCFwAAQAKELgAAgAQIXQAAAAkQugAAABIgdAEAACRA6AIAAEiA0AUAAJAAoQsAACABQhcAAEAChC4AAIAECF0AAAAJELoAAAASIHQBAAAk0NLQZfsE25ts32b77FaOBQAA0M52aVXHtrskfVnSyyXdI+nXtq+IiP9u1Zi1zDv7SknSWQu26/T8Nqqb7nWypO4u68nhGL0fVdp2z5K2hxQhddk67ei5OnfJgjFtVq8f0Io1mzSwdXBMX2ct2K4LLvil7vzjoAa2DqrL1nCE+np7NG/vHl13x0MajpAlzd61S489Oaz9e3u0dPF8LTmyr+HtKp1H6Vgj/a1eP6DlV9ysrYNDo+v09nRr+esOGzNevX4qjXnv1sHRuUsaM86c2d0657WHTWibJrL9pXNp9ZjorLp30rZgvHZ/flsWuiS9SNJtEXGHJNleJelESclD17xpHB4wMSGNBq6R+9UM7dh5ezhCF193tySNBq/V6we07PKNGhwartjXtbdvGbO+JA1szUJY6fiPPjk8+tiyyzdKUkO/DMrnUTrWsss3at1dW3TJrzZraMfYGW4dHNLSSzeMjlevn9J5lbcd2DqopZdu0HCESod56LEhLb1sw5h1m63SXCZSRzSmk+reSduC8abD89vK04t9kjaX3L8nXwa0vZVrd+66K9ZsGn0RN8vg0LBWrNnU0Dq15jE4NKyVa8cHrhFDO2J0vHr9lM6rUtuhHWMD1+jy4Wh4mxpRaS4TqSMa00l176RtwXjT4fl1RK1jAJPo2D5F0uKI+D/5/TdLelFE/G1Zu3dIeock7bfffketWrWq6XPZOPDw6O39eqT7B2s0hiTqJEkL+vaUNHb/KTfZOo2MUUSteTQyXpF+imx7vXXLbdu2TXvssUfD/Y2oNZdG6tjuJlunZmvXuk+kTu26La3UbvtTK032+Z1IrY477rjrI2Jh0fatDF0vlrQ8Ihbn95dJUkScV22dhQsXxrp165o+l9LTi2ct2K7PbGzlWdXOMNPr1GXr9vNeJUk69vyrx5wqLDWZOvX19ujas48v3L7WPCSNXptVb7x6/ZTOq17bWuuW6+/v16JFiwr3Va7aXBqtY7ubbJ2arV3rPpE6teu2tFK77U+tNNnndyK1st1Q6Grl6cVfSzrU9kG2d5V0qqQrWjge0DSnHT139PbSxfPV093V1P57urtGL0gvqtY8erq7dNrRc9U9yxUf757l0fHq9VM6r0ptu2dZlYbp7nLD29SISnOZSB3RmE6qeydtC8abDs9vy0JXRGyXdIakNZJukfTdiLi5VePVcuf5r56KYTGFLGnXLo+5X033LMl5gy5bbzrmgDHvXlxyZJ/OO2mB+np7KvZ17CF7jT7WlXfU19ujYw/Za/S+Je2+a5ecP3beSQsavrCzfB6lY5130gKdu2SBVpxyuHp7uses19vTrRWnHD46Xr1+SudV2nZk7itOOVyffcMRY8aZM7tbK04+vKUXq1aay0TqiMZ0Ut07aVsw3nR4flt2enEiWnV6sdRMOtQ6GdSpGOpUHLUqhjoVQ52KoU7FTffTiwAAAMgRugAAABIgdAEAACRA6AIAAEiA0AUAAJAAoQsAACABQhcAAEAChC4AAIAECF0AAAAJtNUn0tt+UNJdLR5mH0l/aPEYnYA6FUOdiqNWxVCnYqhTMdSpuInU6sCI2Ldo47YKXSnYXtfIR/bPVNSpGOpUHLUqhjoVQ52KoU7FpagVpxcBAAASIHQBAAAkMBND1z9P9QSmCepUDHUqjloVQ52KoU7FUKfiWl6rGXdNFwAAwFSYiUe6AAAAkpsxocv2CbY32b7N9tlTPZ9UbN9pe6PtG2yvy5ftZfsq27fm/87Jl9v2F/Ia3Wj7BSX9vDVvf6vtt5YsPyrv/7Z8XaffyomxfaHtB2zfVLKs5bWpNka7qlKn5bYH8v3qBtuvKnlsWb7Nm2wvLlle8TVo+yDba/N6XGJ713z5bvn92/LH56XZ4omxPdf2NbZvsX2z7ffly9mnStSoE/tUCdtPsf0r2xvyOv19vrzhbWtW/dpVjVp90/bvSvapI/LlU/fai4iO/5HUJel2SQdL2lXSBknPnep5Jdr2OyXtU7bsHyWdnd8+W9Kn89uvkvQjSZZ0jKS1+fK9JN2R/zsnvz0nf+xXkl6cr/MjSa+c6m1uoDYvk/QCSTelrE21Mdr1p0qdlkv6YIW2z81fX7tJOih/3XXVeg1K+q6kU/PbX5X07vz2eyR9Nb99qqRLproWder0TEkvyG8/VdJv83qwTxWrE/vU2O22pD3y292S1ub7SUPb1sz6tetPjVp9U9LJFdpP2WtvphzpepGk2yLijoh4UtIqSSdO8Zym0omSLspvXyRpScnyb0XmOkm9tp8pabGkqyJiS0Q8JOkqSSfkjz0tIn4Z2R73rZK+2l5E/FzSlrLFKWpTbYy2VKVO1ZwoaVVEPBERv5N0m7LXX8XXYP6/xeMlXZavX17zkTpdJul/jPzvsh1FxH0R8V/57T9JukVSn9inxqhRp2pm5D6V7xfb8rvd+U+o8W1rZv3aUo1aVTNlr72ZErr6JG0uuX+Par/IO0lI+nfb19t+R75sv4i4T8p+AUp6er68Wp1qLb+nwvLpLEVtqo0x3ZyRH5q/sOSQeqN12lvS1ojYXrZ8TF/54w/n7dtefmrnSGX/42afqqKsThL71Bi2u2zfIOkBZQHgdjW+bc2sX9sqr1VEjOxTn8z3qc/Z3i1fNmWvvZkSuir9T2amvG3z2Ih4gaRXSnqv7ZfVaFutTo0u70TUZqyvSDpE0hGS7pP0mXx5M+s0LWtoew9J35P0dxHxSK2mFZbNmH2qQp3Yp8pExHBEHCHpWcqOTP15pWb5v82q07Sq0YjyWtl+nqRlkp4j6YXKThl+OG8+ZTWZKaHrHklzS+4/S9K9UzSXpCLi3vzfByR9X9kL9/78cKnyfx/Im1erU63lz6qwfDpLUZtqY0wbEXF//ktuh6QLlO1XUuN1+oOyQ/u7lC0f01f++J4qfppzStjuVhYkvh0Rl+eL2afKVKoT+1R1EbFVUr+y648a3bZm1q/tldTqhPxUdkTEE5K+oYnvU0177c2U0PVrSYfm78jYVdlFhldM8Zxazvbutp86clvSKyTdpGzbR96V8VZJ/5rfvkLSW/J3dhwj6eH8cOkaSa+wPSc/5P8KSWvyx/5k+5j8OoC3lPQ1XaWoTbUxpo2RXzK51yvbr6Rs20519k6qgyQdquwC1Iqvwfz6iGsknZyvX17zkTqdLOnqvH1byp/nr0u6JSI+W/IQ+1SJanVinxrL9r62e/PbPZL+Stn1b41uWzPr15aq1Oo3JWHIyq61Kt2npua1F23wzoMUP8rerfBbZefEPzrV80m0zQcre0fKBkk3j2y3snP2P5V0a/7vXvlyS/pyXqONkhaW9PU2ZRdg3ibpb0osb1DKAAADUklEQVSWL8x35NslfUn5B+5Ohx9JK5WdxhhS9j+Z/52iNtXGaNefKnX6l7wON+a/dJ5Z0v6j+TZvUsm7Wau9BvP99Fd5/S6VtFu+/Cn5/dvyxw+e6lrUqdNLlZ1yuFHSDfnPq9inCteJfWpsnZ4vaX1ej5skfXyi29as+rXrT41aXZ3vUzdJulg73+E4Za89PpEeAAAggZlyehEAAGBKEboAAAASIHQBAAAkQOgCAABIgNAFAACQAKELAAAgAUIXgKRsn2n7Ftvfnuq5VGN7nu2b6rcEgOJ2qd8EAJrqPco+oPF3Uz2RVrG9S+z8wmAAkMSRLgAJ2f6qsk+7vsL2h23/wvb6/N/5eZu1tg8rWaff9lFV+ltu+8K8zR22z8yXjzlSZfuDtpeX9Pc52z/Pj7i90Pbltm+1fW5J97vYvsj2jbYvsz07X/8o2z+zfb3tNSVfNdJv+1O2fybpfU0tHICOQOgCkExEvEvZF8UeJ+krkl4WEUdK+rikT+XNVkl6gzT6fXz7R8T1Nbp9jqTFyr7M9hxnX6Zcz5MR8TJJX1X2XWnvlfQ8Safb3jtvM1/SP0fE8yU9Iuk9ed9flHRyRBwl6UJJnyzptzci/jIiPlNgDgBmGE4vApgqe0q6yPahyr6LbyQsfVfSVZLOURa+Lq3Tz5UR8YSkJ2w/IGm/AmOPfOH9Rkk3R/aFtrJ9h6S5krZK2hwR1+btLpZ0pqQfKwtnV2Xfe6suZd9LOeKSAmMDmKEIXQCmyj9IuiYiXm97nqR+SYqIAdt/tP18Sf9L0jvr9PNEye1hZb/XtmvskfynVFlnR9n6O7Tz92L5F9OGsi/KvTkiXlxlLo/WmSuAGYzTiwCmyp6SBvLbp5c9tkrShyTtGREbJ9D3/ZKebntv27tJes0E+jjA9ki4Ok3Sf0raJGnfkeW2u0uvPwOAWghdAKbKP0o6z/a1yk7TlbpM0qnKTjU2LCKGJH1C0lpJP5T0mwl0c4ukt9q+UdJekr4SEU9KOlnSp21vkHSDpJdMZI4AZh5HlB9BBwAAQLNxpAsAACABLqQH0PZs/43Gf/bVtRHx3qmYDwBMBKcXAQAAEuD0IgAAQAKELgAAgAQIXQAAAAkQugAAABIgdAEAACTw/wEzlVZSyh1PWAAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 720x360 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "count     18836.000000\n",
       "mean       4413.461563\n",
       "std       12468.532705\n",
       "min           0.000000\n",
       "25%          13.000000\n",
       "50%         482.500000\n",
       "75%        3375.500000\n",
       "max      341621.000000\n",
       "Name: fav_number, dtype: float64"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#继续探索其他维度的特征\n",
    "drawScatter(df['fav_number'],y,'fav_number')\n",
    "\n",
    "df['fav_number'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "#从直观的角度来看分布，0-7000左右，差异不大\n",
    "#从7000以后，female的表现大于male，大于brand，分布还是有差距的\n",
    "#可以作为重点参考特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAl8AAAFOCAYAAAC43Xi+AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAHElJREFUeJzt3XmcXGWd7/HPL51OCEQIMZjRsAQjV9SJomTcgkxQR1yHXF8uICqKitf16mWuNy5XcWRGx1HnzuK9jowL6AzBDS8aB0W0dURBg4IBEVkMhGVgNCwmQMjymz/O6VBpu7qrkzpPdVd/3q9XvbrqqbM89evTlW+e55yqyEwkSZJUxoxed0CSJGk6MXxJkiQVZPiSJEkqyPAlSZJUkOFLkiSpIMOXJElSQYYvSZKkggxfUh+JiPUR8cz6/rsi4p86XO+0iPh8s73bua+hiHhtl7d5cERsioiBNs8Xe32lRcTiiMiImNnrvkjqjOFL6lOZ+ZeZ2dWQM1ll5o2ZOTczt09kvYhYFBHbImLJKM+dGxEfqe8fFxGXRcTdEfGbiLgwIhaPsd1lEfH1iLgjIu6MiF9ExF9ExP4TfW2S+o/hS9KU0MTITmbeDFwIvGLEvuYDzwXOjIhHAGcBpwL7AYcC/xfY0aafTwWGgIuAwzNzHvBsYBvwuG6/hj3haJnUG4YvqU+1TrW1TE2dFBE31qM3726z3mBEnB0RX46IWWNsf6Ce2rwuIn4XEZdGxEH1c0+NiJ9ExF31z6e22caMiHhPRNwQEbdHxFkRsd+IPr8mIm4EvjNGX3aZeouIQyPie3W/LgAWjFGqMxkRvoDjgSszcx1wBPDrzLwwK7/LzC9n5o1ttvdh4DOZ+cHMvA12jsy9LzOHWvp8ckRcVY+OfTMiDml5LiPiv0XENfXzH4+IqJ8biIiP1L/D64HnjajFfhHxqYi4NSJujojTh6djI+JVEXFRRPxNRGwEThujLpIaYviSppejgEcCzwDeGxGPan0yIuYAXwW2AC/JzPvH2Nb/AE6gGiHaFzgZuKceNVoD/B3wYOBjwJqIePAo23hVfTsGeDgwF/iHEcv8MfAo4NhOXyTwL8ClVKHrA8BJYyx7LrAgIo5qaXsF1WgXwE+Bw+vAckxEzG23oYjYB3gK8OWxOhcRK4F3AS8EDgD+DTh7xGLPB/6IarTsJTzw+l9XP/d4YBnwohHrnUk1yvaIeplnAa3Tz08CrgceAvzFWP2U1AzDlzS9vD8z783My4HL2XUabF/gfOA64NUdnD/1WuA9mXl1PSJ0eWb+lmok5prM/FxmbsvMs4FfAi8YZRsnAh/LzOszcxPwTuD4EdNhp2Xm5sy8t5MXGBEHU4WW/52ZWzLz+8DX2i1fb/eLwCvr9Q8DjqQKcGTm9cAKYBHwBeA3EfHZNiFsf6r31X9v6c+H6/O+NkfEe+rm1wMfzMyrMnMb8JfAEa2jX8CHMvPOeoTtu1QjcFAFsf+TmRsycyPwwZZ9LQSeA7ytrtntwN9QjeQNuyUz/77+3XRUU0ndZfiSppd/b7l/D9VI07AnA4+l+kc/O9jWQVRBbaSHATeMaLuBKryMt+wNwExgYUvbhg76MnKbd2Tm5hHbHcuZwEsiYi+qUa/z6+ACQGZenJkvycwDgKcBRwOjTdveQXUu2ENb1n1Hfd7XuVSvDeAQ4G/rUHYnsBEIdq1Ru9/Vw9i1Jq2v7RBgELi1Zdv/SDXKNWyi9ZTUZYYvScO+RTWKcmE9gjKeDcDvXSUI3EIVAlodDNzcwbIHU02Z3dbS1kkQbHUrsH89Bdi63bYy89+A3wLHAS/ngSnH0Zb9CfAV4A9HeW4zcAnVdOJYNgCvz8x5Lbc5mfnDcdaD6vUd1PK49bVtoJoyXtCy3X0z8zGt3exgH5IaZPiStFNmfphquu3CiBjrJHWAfwI+EBGHReWx9Xld3wD+S0S8LCJmRsRLgUcDXx9lG2cDb69PkJ9LNf12Tj0Vt7uv4QZgLfD+iJhVn8s12pTnSGcBfwXMo2WaMiKOiojXRcRD6seHA38KXNxmO+8ATo6IVS3rHEh1leSwTwDvjIjH1M/vFxEv7vAlfgF4a0QcGNVHV6wafiIzb6UK0R+NiH3rCxqWRMQfd7htSQUYviTtIjM/QHXS/bfrk+fb+RhVEPgWcDfwKWBOfd7X86k+muG3VGHk+Zn5m1G28Wngc8D3gV8D9wFv6cLLeBnVieUbgfcxxkhWi7OoRpHOycwtLe13UoWtdRGxieq8uHOprmr8PZn5A+DpVFOTv6qn/s6n+viJv6+XOZcq6K2OiLuBK6jO1erEGcA3qc7Z+ynVKFyrVwKzgF9QTYN+iZZpUEm9F52d2iFJkqRucORLkiSpIMOXpLYi4l+j+s7Ekbd39aAvJ7bpy5Wl+yJJe8JpR0mSpIIc+ZIkSSpoUn2p6oIFC3Lx4sWN7mPz5s3ss88+4y+o3WJ9m2V9m2V9m2V9m2V9mzVafS+99NLf1B++PCGTKnwtXryYtWvXNrqPoaEhVqxY0eg+pjPr2yzr2yzr2yzr2yzr26zR6hsR4317xqicdpQkSSrI8CVJklSQ4UuSJKkgw5ckSVJBhi9JkqSCDF+SJEkFGb4kSZIKMnxJkiQVZPiSJEkqaFJ9wn2TFq9aA8CpS7fxqvr+VBLAnn4FegQ84oB9uOb2zd3o0qj2pL4BzJwBW3d0t0/9ZCL1HYhgwdxBbvvd/Q33qt3+YUZMnd/n3oMzeOPh90/J94epYqq+/04Vpy7dxhln/Ih/ft1Tdra956vr+JdLbmTHBP8BmT1zBgMB97T5A95rILhv+57+q1TG+g89r9dd+D3TYuRrcR/8sXfjEM+k0eC1p5Kp8w/1VLA9s2fBq9r/1Pp93rN1R1f+zqReuui6jZx4xo+AKnh9/uKJBy+ALdt2tA1ewJQJXjA5M8C0CF+SJE0XF123EYCzL9nQ456oHcOXJEl9aHtOndGp6cbwJUlSHxqI6HUX1IbhS5KkPrJ8yXwATnjSQT3uidqZFuFrMl7pMFHd+P9LBBz2kH26sKVmBDA4LY7IMgYiWPigWT3c/9T6fe49OKMrf2dSLy1fMn/n1Y6nr1zKy598MDN248CePXMGe4/xB7zXwNT5a5mMGWDafNTEcPGHhoZYf+KK3namj1nfZlnfZlnfZlnfZg0NDfGWFU/Zpe30lUs5feXSHvVI7Uyh/5dKkiRNfYYvSZKkggxfkiRJBRm+JEmSCjJ8SZIkFWT4kiRJKsjwJUmSVJDhS5IkqSDDlyRJUkGGL0mSpIIMX5IkSQUZviRJkgoyfEmSJBVk+JIkSSrI8CVJklSQ4UuSJKkgw5ckSVJBhi9JkqSCDF+SJEkFGb4kSZIKMnxJkiQVZPiSJEkqyPAlSZJUkOFLkiSpIMOXJElSQYYvSZKkggxfkiRJBRm+JEmSCjJ8SZIkFWT4kiRJKsjwJUmSVJDhS5IkqSDDlyRJUkGGL0mSpIIMX5IkSQUZviRJkgoyfEmSJBVk+JIkSSrI8CVJklSQ4UuSJKkgw5ckSVJBhi9JkqSCDF+SJEkFGb4kSZIKMnxJkiQVZPiSJEkqyPAlSZJUkOFLkiSpIMOXJElSQYYvSZKkggxfkiRJBRm+JEmSCjJ8SZIkFWT4kiRJKsjwJUmSVJDhS5IkqSDDlyRJUkGGL0mSpIIMX5IkSQUZviRJkgoyfEmSJBVk+JIkSSrI8CVJklSQ4UuSJKkgw5ckSVJBhi9JkqSCDF+SJEkFGb4kSZIKMnxJkiQVZPiSJEkqyPAlSZJUkOFLkiSpIMOXJElSQYYvSZKkggxfkiRJBRm+JEmSCjJ8SZIkFWT4kiRJKsjwJUmSVJDhS5IkqSDDlyRJUkGGL0mSpIIMX5IkSQUZviRJkgoyfEmSJBVk+JIkSSrI8CVJklSQ4UuSJKkgw5ckSVJBhi9JkqSCDF+SJEkFGb4kSZIKMnxJkiQVZPiSJEkqyPAlSZJUkOFLkiSpIMOXJElSQYYvSZKkggxfkiRJBRm+JEmSCjJ8SZIkFTRu+IqIgYj4donOSJIk9btxw1dmbgfuiYj9CvRHkiSpr83scLn7gHURcQGwebgxM9/aSK8kSZL6VKfha019kyRJ0h7oKHxl5pkRMQc4ODOvbrhPkiRJfaujqx0j4gXAZcD59eMjIuK8JjsmSZLUjzr9qInTgCcCdwJk5mXAoQ31SZIkqW91Gr62ZeZdI9qy252RJEnqd52ecH9FRLwMGIiIw4C3Aj9srluSJEn9qdORr7cAjwG2AGcDdwNva6pTkiRJ/arTqx3vAd5d3yRJkrSbxgxfEfE1xji3KzP/tOs9kiRJ6mPjjXx9pP75QuAPgM/Xj08A1o+1YkR8Gng+cHtm/uEe9FGSJKlvROb4Fy1GxPcz8+jx2kY8fzSwCTir0/C1bNmyXLt2bSeLTtjiVdUH9J+6dBsfXdfpdQaaKOvbrOlQ331nD3D3lu092fd0qG8vNVXfeXMG2bp9B5vv373jZtZAcP/28f8t3HtwBi888kC+fOlN3Lt1R9vl9t97kEc/9EH88LqNY34swKJ5c/ifxz6SlY9fxFd/djOnnXcld967dUJ9X75kPgAXXbexK/UdnAFPPHT+uH2fStZ/6Hld2c7Q0BArVqzYpS0iLs3MZRPdVqe/pQMi4uGZeX29s0OBA8ZaITO/HxGLJ9qhJgwHL0mTX6+Cl6auiQaWkToJXgD3bN3B5y++cdzl7rhnKxddt3Hc5W6+817e+ZV1rL1hI+f8eANbd0w87nSyn4nYuqP72+y1xavWdC2AdUun4evtwFBEXF8/Xgy8vpEeSZI0Tdy7dTtnX7KB7R3MQql/dDTtCBARs4HD64e/zMwtHayzGPj6WNOOEXEKcArAwoULj1y9enVH/ZmIdTc/8PmwC+fAbfd2fReqWd9mWd9mWd9mWd9mWd/2li7ab4+3sWnTJubOnbtL2zHHHLNb044TCV9PpRrx2jlalplnjbPOYsYJX62aOuerddrRczqaZX2bZX2bZX2bZX1HNxDRlZEv69teN6Ydu3nOV6dfrP05qisfjwL+qL5NeGeSJOkBcwYHOOFJBzE4I3rdFRXU6SfcLwOWZ+YbM/Mt9e2tY60QEWcDPwIeGRE3RcRr9rSzu2uynWgnqb19Zw/0uguaYubNGWSfWbt/3Mwa6Cz47D04g5c/+WDmDI79T+f+ew+yfMl8xtvqonlz+OALl3L6yqX89Ysfx7w5gx32+AHLl8zfecVjNwzOoKO+TyWTMQN0/N2OVJ/zdWunG87ME3arRw0ZLv7Q0BDrT1zR2870MevbLOvbLOvbrH6p7+krl3Z9mysfv4iVj1+0R9vol/pOB52GrwXALyLix1Tf7wj4CfeSJEkT1Wn4Oq3JTkiSJE0XnX6x9vci4hDgsMz8dkTsDXhihiRJ0gR1erXj64AvAf9YNy0CvtpUpyRJkvpVp1c7vglYDtwNkJnXAA9pqlOSJEn9qtPwtSUz7x9+EBEzoW++c1OSJKmYTsPX9yLiXcCciPgT4IvA15rrliRJUn/qNHytAv4DWEf1PYxrMvPdjfVKkiSpT40ZviLiuIh4U2buyMwzgEOoPu3+XRHxoiI9lCRJ6iPjjXy9Aziv5fEs4EhgBfCGhvokSZLUt8b7nK9Zmbmh5fEPMnMjsDEi9mmwX5IkSX1pvJGv/VsfZOabWx4e0P3uSJIk9bfxwtcl9Qes7iIiXg/8uJkuSZIk9a/xph3fDnw1Il4G/LRuOxKYDaxssmOSJEn9aMzwlZm3A0+NiKcDj6mb12TmdxrvmSRJUh/q9Iu1vwMYuCRJkvZQpx+yKkmSpC4wfEmSJBVk+JIkSSrI8CVJklSQ4UuSJKkgw5ckSVJBhi9JkqSCDF+SJEkFGb4kSZIKMnxJkiQVZPiSJEkqyPAlSZJUkOFLkiSpIMOXJElSQYYvSZKkggxfkiRJBRm+JEmSCjJ8SZIkFWT4kiRJKsjwJUmSVJDhS5IkqSDDlyRJUkGGL0mSpIIMX5IkSQUZviRJkgoyfEmSJBVk+JIkSSrI8CVJklSQ4UuSJKkgw5ckSVJBhi9JkqSCDF+SJEkFGb4kSZIKMnxJkiQVZPiSJEkqyPAlSZJUkOFLkiSpIMOXJElSQYYvSZKkggxfkiRJBRm+JEmSCjJ8SZIkFWT4kiRJKsjwJUmSVJDhS5IkqSDDlyRJUkGGL0mSpIIMX5IkSQUZviRJkgoyfEmSJBVk+JIkSSrI8CVJklSQ4UuSJKkgw5ckSVJBhi9JkqSCDF+SJEkFGb4kSZIKMnxJkiQVZPiSJEkqyPAlSZJUkOFLkiSpIMOXJElSQYYvSZKkggxfkiRJBRm+JEmSCjJ8SZIkFWT4kiRJKsjwJUmSVJDhS5IkqSDDlyRJUkGGL0mSpIIMX5IkSQUZviRJkgoyfEmSJBVk+JIkSSrI8CVJklSQ4UuSJKkgw5ckSVJBhi9JkqSCDF+SJEkFGb4kSZIKMnxJkiQVZPiSJEkqyPAlSZJUkOFLkiSpIMOXJElSQYYvSZKkggxfkiRJBRm+JEmSCjJ8SZIkFWT4kiRJKsjwJUmSVJDhS5IkqSDDlyRJUkGGL0mSpIIMX5IkSQUZviRJkgpqNHxFxLMj4uqIuDYiVjW5L0mSpKkgMrOZDUcMAL8C/gS4CfgJcEJm/qLdOsuWLcu1a9c20p/Fq9YAcOrSbXx03cxdnlu+ZD5X3vI77rx3ayP7nk5Gq6+659Sl2/iHq2axZduOXnelL3n8Nsv6Nsv6jm79h57Xle0MDQ2xYsWKXdoi4tLMXDbRbTU58vVE4NrMvD4z7wdWA8c1uL+2hoNXOxddt9HgpSnD4CVJnRsvA/RCk+FrEbCh5fFNdZskSdK01eS044uBYzPztfXjVwBPzMy3jFjuFOAUgIULFx65evXqrvdl3c137by/cA7cdm/Xd6Ga9W2W9W2W9W2W9W2W9W1v6aL99ngbmzZtYu7cubu0HXPMMbs17djk5PBNwEEtjw8Ebhm5UGZ+EvgkVOd8jZxP7YZXtQw5OifeLOvbLOvbLOvbLOvbLOvb3voTV+zxNkY752t3NTnt+BPgsIg4NCJmAccD5zW4P0mSpEmvsfCVmduANwPfBK4CvpCZVza1v7GMd6XD8iXzmTdnsFBvpD0ze6YfzydJnerW1Y7d1Oj4ZGZ+A/hGk/vo1HDxh4aGujL8qNFZ32YNDQ1xtfVtjMdvs6xvs6zv1OF/oSVJkgoyfEmSJBVk+JIkSSrI8CVJklSQ4UuSJKkgw5ckSVJBhi9JkqSCDF+SJEkFGb4kSZIKiszsdR92ioj/AG5oeDcLgN80vI/pzPo2y/o2y/o2y/o2y/o2a7T6HpKZB0x0Q5MqfJUQEWszc1mv+9GvrG+zrG+zrG+zrG+zrG+zullfpx0lSZIKMnxJkiQVNB3D1yd73YE+Z32bZX2bZX2bZX2bZX2b1bX6TrtzviRJknppOo58SZIk9cy0CV8R8eyIuDoiro2IVb3uz1QVEesjYl1EXBYRa+u2+RFxQURcU//cv26PiPi7uuY/j4gn9Lb3k1NEfDoibo+IK1raJlzTiDipXv6aiDipF69lMmpT39Mi4ub6OL4sIp7b8tw76/peHRHHtrT7HjJCRBwUEd+NiKsi4sqI+O91u8dvF4xRX4/fLoiIvSLixxFxeV3f99fth0bEJfWxeE5EzKrbZ9ePr62fX9yyrVHr3lZm9v0NGACuAx4OzAIuBx7d635NxRuwHlgwou3DwKr6/irgr+r7zwX+FQjgycAlve7/ZLwBRwNPAK7Y3ZoC84Hr65/71/f37/Vrmwy3NvU9DfizUZZ9dP3+MBs4tH7fGPA9pG1tHwo8ob7/IOBXdQ09fputr8dvd+obwNz6/iBwSX1cfgE4vm7/BPCG+v4bgU/U948Hzhmr7mPte7qMfD0RuDYzr8/M+4HVwHE97lM/OQ44s75/JrCypf2srFwMzIuIh/aig5NZZn4f2DiieaI1PRa4IDM3ZuYdwAXAs5vv/eTXpr7tHAeszswtmflr4Fqq9w/fQ0aRmbdm5k/r+78DrgIW4fHbFWPUtx2P3wmoj8NN9cPB+pbA04Ev1e0jj9/h4/pLwDMiImhf97amS/haBGxoeXwTYx/Aai+Bb0XEpRFxSt22MDNvherNAnhI3W7dd99Ea2qtJ+7N9dTXp4enxbC+u62egnk81eiBx2+XjagvePx2RUQMRMRlwO1Uof864M7M3FYv0lqrnXWsn78LeDC7Ud/pEr5ilDYv89w9yzPzCcBzgDdFxNFjLGvdu69dTa31xPw/YAlwBHAr8NG63fruhoiYC3wZeFtm3j3WoqO0Wd9xjFJfj98uycztmXkEcCDVaNWjRlus/tm1+k6X8HUTcFDL4wOBW3rUlyktM2+pf94OnEt1sN42PJ1Y/7y9Xty6776J1tRaT0Bm3la/6e4AzuCBKQLrO0ERMUgVDP45M79SN3v8dslo9fX47b7MvBMYojrna15EzKyfaq3VzjrWz+9HdUrDhOs7XcLXT4DD6isYZlGdKHdej/s05UTEPhHxoOH7wLOAK6hqOXx10knA/6/vnwe8sr7C6cnAXcNTERrXRGv6TeBZEbF/PQXxrLpNoxhx7uF/pTqOoarv8fVVTYcChwE/xveQUdXnu3wKuCozP9bylMdvF7Srr8dvd0TEARExr74/B3gm1Xl13wVeVC828vgdPq5fBHwnqzPu29W9vV5fbVDqRnWVza+o5nPf3ev+TMUb1ZUyl9e3K4frSDXnfSFwTf1zft0ewMfrmq8DlvX6NUzGG3A21dTBVqr/Qb1md2oKnEx1oue1wKt7/bomy61NfT9X1+/n9RvnQ1uWf3dd36uB57S0+x7y+7U9imp65efAZfXtuR6/jdfX47c79X0s8LO6jlcA763bH04Vnq4FvgjMrtv3qh9fWz//8PHq3u7mJ9xLkiQVNF2mHSVJkiYFw5ckSVJBhi9JkqSCDF+SJEkFGb4kSZIKMnxJkiQVZPiSVExEbKp/PiwivtTp8g3047MR8aLxlxx3O38eEc8cpX1FRHx9T7cvqT/NHH8RSequrL6mao/DTykRMZCZ20e2Z+Z7e9EfSVObI1+SiouIxRFxRX3/VRHxlYg4PyKuiYgPj7L8goj4UUQ8b4xtviMi1kXE5RHxobrtiIi4OCJ+HhHn1l9dM3K9Z0TEz+p1Px0Rs+v29RHx3oj4AfDiNvvcOYIWEc+OiF/Wy79wd+oiaXowfEmaDI4AXgosBV4aETu/pDYiFgJrqL76Y81oK0fEc4CVwJMy83HAcIA7C/hfmflYqq9jed+I9fYCPgu8NDOXUs0GvKFlkfsy86jMXD1W5+vtnAG8AHga8AedvGhJ05PhS9JkcGFm3pWZ9wG/AA6p2wepvhvwHZl5wRjrPxP4TGbeA5CZGyNiP2BeZn6vXuZM4OgR6z0S+HVm/qrNMud02P/D6+1ck9V3tn2+w/UkTUOGL0mTwZaW+9t54HzUbcClwLHjrB9UX0A8UTHO85snsC2/KFdSRwxfkiazBE4GDo+IVWMs9y3g5IjYGyAi5mfmXcAdEfG0eplXAN8bsd4vgcUR8YgxlunEL4FDI2JJ/fiE3diGpGnC8CVpUquvMjweOCYi3thmmfOB84C1EXEZ8Gf1UycBfx0RP6c6r+zPR6x3H/Bq4IsRsQ7YAXxiN/p4H3AKsKY+4f6GiW5D0vQR1ekJkiRJKsGRL0mSpIL8kFVJU0ZELAU+N6J5S2Y+qeH9fhxYPqL5bzPzM03uV1J/ctpRkiSpIKcdJUmSCjJ8SZIkFWT4kiRJKsjwJUmSVJDhS5IkqaD/BH+0CuCrkemmAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 720x360 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#针对类型类的特征，先进行编码，编码之前构建字典\n",
    "dic_link_color = []\n",
    "df_link_color = df['link_color']\n",
    "list_link_color = df_link_color.tolist()\n",
    "\n",
    "#构建link_color字典\n",
    "for i in list_link_color:\n",
    "    if i not in dic_link_color:\n",
    "        dic_link_color.append(i)\n",
    "    \n",
    "label_link_color = preprocessing.LabelEncoder()\n",
    "label_link_color.fit(dic_link_color)\n",
    "\n",
    "df_link_color_id_tmp = label_link_color.transform(df_link_color)\n",
    "df_link_color_id = pd.DataFrame(df_link_color_id_tmp, index=df.index, columns=['link_color_id'])\n",
    "\n",
    "#查看link_color_l分布\n",
    "drawScatter(df_link_color_id,y,'link_color_id')\n",
    "#分布差异不大"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlsAAAFNCAYAAAA3oqpqAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAHYRJREFUeJzt3Xu8XXV55/HPlxDkcCnBQrEEJBYZ6iUFBEVLa4O1DRYrKaNV6tTSatUZBW0pLWhnajta6VDrtKPWy9SKN1JRzOClRi0GRJE7GpGmooAhoKgQNBAhCc/8sVdgcziXvcn55Zyc83m/Xue197o/5zkr7C9rrb1WqgpJkiS1sdN0FyBJkjSbGbYkSZIaMmxJkiQ1ZNiSJElqyLAlSZLUkGFLkiSpIcOWJElSQ4YtSZqlkrwvyRunuw5prjNsSXNAkjck+eB23uaiJJVk5+253UElOTnJJRNMf1eS948x/heS3Jvk0UkWJHlvku8m+XGS/0jyZxOsc5ck/yPJmiR3J1mX5F+T/PpU/V6SZh7DlrSDm6lhZhZ4H3Bikt1HjX8J8MmqugN4K7AH8ARgL+B5wLcmWOdHgRO6dewNPA74e+D4Ka18G7lPSVPLsCXtgJLclOTPknwNuDvJzkn2T/KxJN9PcmOSU7t5jwNeB7wwyYYkX01ybJLVfev7fJLL+4YvSbKsez/mertpOyU5I8m3kvwwyUeSPLqbfHH3ur7b7jMm+Z3+MMn13RGibyR5Sjf+CUlWJVmf5Lokz+tbZlWSl/UNP+RoVXdk7ZVJvpnkziRvT88TgHcCz+hqWz+6nqq6FFgH/Oe+9c0Dfgc4pxv1VODDVXVnVd1fVf9eVR8d5/d7NvBrwAlVdVlV3df9fKaqXtM330T9fkPX4/d3fbouyVF9049IcnU37V+AXUfV8Nwk13a9/HKSX+ib9rB9auy/lKRhGbakHddJ9I6ILADuBz4BfBVYCPwq8NokS6vqM8BfA/9SVXtU1WHApcDjk+zTfag+GTggyZ5JRoAjgS8m2Wm89XY1nAosA34F2B+4E3h7N+2Z3euCbruXjveLJHkB8AZ6R3x+it4Roh8mmd9t/7PAzwCnAB9KcugQfXouvVB0GPDbwNKquh54JXBpV9uCcZZ9f1fTVs8G5gP/2g1/BXhTkt9PcsgkdTwbuKyqbhlvhgH6Db3eLKf3d78AeFu37C7ACuADwKOB83hoUHwK8F7gFcBPA+8CLkjyqL51P7BPVdXmSX4fSQMybEk7rn+oqrVVtZFemNi3qv6qO1rybeA9wIvGWrCqfgJcSS8QHQV8DbgEOAZ4OvDNqvrhAOt9BfD6qrqlqu6lF5ie/wiOirwM+F9VdUX13FBVN3e17AGc1W3/QuCT9ELBoM6qqvVV9R3gC8DhQyz7AeBXkhzQDb+E3pGsTd3wKcCHgFcD30hyQ5LnjLOufYDvbh3orvlan+SuJD/pRg/yd7ykqj5dVVu6+g7rxj+dXhD831W1qTvCdkXfcn8IvKs7qralqs4B7u2W26p/n5I0RTxMLO241va9PwjYf9TpsHnAFydY/iJgCXBL9/5Oekeo7u2GB1nvQcDHk9zfN30LsN9QvwkcyNjXOu0PrK2q/vXfTO+oz6C+2/f+HnrhbSBV9Z0kFwP/Jcnb6B3F++W+6RvpHTX86yQ/BZwBnJfksd01Xf1+CBzSt+wdwIIkjwe+2Y0e5O84+vfZtQu3+wPrqqr6pt/c9/4g4PeSnNI3bpduua369ylJU8SwJe24+j9U1wI3VtV4p7JqjHEXAW8BvgOcRS9svYde2Np6KnCy9a4F/qCqvjR6QpKDJv0NHrqeg8cYfytwYJKd+gLXY4H/6N7fDezWN/9jhtjmWD0Zyzn0QtRt9Hpx9Zgrq/pRkr8GzqR34fvosPVvwClJDpjgVOJk/Z7IbcDCJOkLXI/lwRC7FnhTVb1pgnUM2hNJQ/A0ojQ7XA78qLvAeSTJvCRPTvLUbvr3gEXdNUFbfRk4FHgacHlVXUfv6MfRPHhx+2TrfSe9a5YOAkiyb5ITumnfp3ct2c8NUP//Bf4kyZHdBeyP79Z5Gb1A9adJ5idZAvwmvWuWAK6l943B3bojRC8dtGFdTw7ornWayMfoHXn7Sx68MB6AJP89yVPTu6XDrsBrgPXAmtErqarP0juNuSLJ0d0y83noabzJ+j2RS4HNwKnpfWHiRHp/263eA7yy23aS7J7k+CR7DrBuSdvAsCXNAt31O79J73qkG4Ef0Aswe3WznNe9/jDJ1d0ydwNXA9dV1X3d9EuBm6vq9gHX+/f0LtL+bJIf07tg/Ohu2XuANwFf6q5N6g8Vo+s/r5v3w8CP6V3o/eiurucBz+m2/Q7gJVX1792ibwXuoxeczqF3/dSgLgSuA76b5AcT1HY3Dwau0esv4J+72m6l923D46tqwzirO5HeNWcfpBfKbgReDBzXbWuyfo+r69WJwMn0jlK+EDi/b/qV9K7bels3/YZuXkmN5aGn9yVJkjSVPLIlSZLUkGFL0naR5J3dDURH/7xzumuTpJY8jShJktSQR7YkSZIamlH32dpnn31q0aJFTbdx9913s/vuo58rq4nYs+HZs+HZs+HYr+HZs+HZs4ldddVVP6iqfSebb0aFrUWLFnHllVc23caqVatYsmRJ023MNvZsePZsePZsOPZrePZsePZsYklunnwuTyNKkiQ1ZdiSJElqyLAlSZLUkGFLkiSpIcOWJElSQ4YtSZKkhgxbkiRJDRm2JEmSGppRNzVtadEZnwLgtMWbObl7P6h5gT13nc9dGzcxMn8nNm6+n/5HSi4YmU8C6+/ZxP4LRjh96aEsO2IhK65Zx9kr13Dr+o0PGS9JkqbeTP3cnRNha9GQ4Wq0LQXrN24C4J5N9z9s+tZpAOvWb+TM81dz5c138LGr1rFx05aHjAdmxB9ekqTZZMU16zjz/NUz8nPX04gNbNy0hXMvW/vAH7x//Nkr10xTVZIkzV5nr1wzYz93DVuNbOk/z9jn1vUbt3MlkiTNfuN9vs6Ez13DViPzkjHH779gZDtXIknS7Dfe5+tM+Nw1bDUwMn8eJx19ICPz5z1s/OlLD52mqiRJmr1OX3rojP3cnRNh66azjt+m5eel+8YhsNv8nRh90GrByHz23q03feGCEd584mLeuGwxbz5xMQsXjDxk/HRfpCdJ0my07IiFM/Zzd058GxEeDFyrVq3iphcv2S7bXHbEwhnxR5YkaS6YqZ+7c+LIliRJ0nQxbEmSJDVk2JIkSWrIsCVJktSQYUuSJKkhw5YkSVJDhi1JkqSGDFuSJEkNGbYkSZIaMmxJkiQ1ZNiSJElqyLAlSZLUkGFLkiSpIcOWJElSQ4YtSZKkhgxbkiRJDRm2JEmSGjJsSZIkNWTYkiRJasiwJUmS1JBhS5IkqSHDliRJUkOGLUmSpIYMW5IkSQ0ZtiRJkhoybEmSJDVk2JIkSWrIsCVJktSQYUuSJKkhw5YkSVJDhi1JkqSGDFuSJEkNGbYkSZIaMmxJkiQ1ZNiSJElqyLAlSZLUkGFLkiSpIcOWJElSQ4YtSZKkhgxbkiRJDRm2JEmSGjJsSZIkNWTYkiRJasiwJUmS1JBhS5IkqSHDliRJUkOGLUmSpIYMW5IkSQ0ZtiRJkhoybEmSJDVk2JIkSWrIsCVJktSQYUuSJKkhw5YkSVJDhi1JkqSGDFuSJEkNGbYkSZIaMmxJkiQ1ZNiSJElqyLAlSZLUkGFLkiSpIcOWJElSQ4YtSZKkhgxbkiRJDRm2JEmSGjJsSZIkNWTYkiRJasiwJUmS1JBhS5IkqSHDliRJUkOGLUmSpIYMW5IkSQ0ZtiRJkhoybEmSJDVk2JIkSWrIsCVJktSQYUuSJKkhw5YkSVJDhi1JkqSGDFuSJEkNGbYkSZIaMmxJkiQ1ZNiSJElqyLAlSZLUkGFLkiSpIcOWJElSQ4YtSZKkhgxbkiRJDRm2JEmSGjJsSZIkNWTYkiRJasiwJUmS1JBhS5IkqSHDliRJUkOGLUmSpIYMW5IkSQ0ZtiRJkhoybEmSJDVk2JIkSWrIsCVJktSQYUuSJKkhw5YkSVJDk4atJPOSfH57FCNJkjTbTBq2qmoLcE+SvbZDPZIkSbPKzgPO9xNgdZLPAXdvHVlVpzapSpIkaZYYNGx9qvuRJEnSEAYKW1V1TpIR4LFVtaZxTZIkSbPGQN9GTPKbwLXAZ7rhw5Nc0LIwSZKk2WDQWz+8AXgasB6gqq4FHteoJkmSpFlj0LC1uaruGjWuproYSZKk2WbQC+S/nuR3gHlJDgFOBb7crixJkqTZYdAjW6cATwLuBc4FfgS8tlVRkiRJs8Wg30a8B3h99yNJkqQBTRi2knyCCa7NqqrnTXlFkiRJs8hkR7b+tns9EXgM8MFu+CTgpokWTPJe4LnA7VX15G2ocUosOqN3T9bTFm/m5DOGuz/r7rvMY/68nVi/cRPzErZUsXDBCKcvPZRlRyycdPk/X7Gacy9by5Yq5iWcdPSBvHHZ4kf0e4xlxTXrOHvlGm5dv5H9h6hLmoncnyXNNhOGraq6CCDJ/6yqZ/ZN+kSSiydZ9/uAtwHv36YKp8CiIcPVaHfftwXYAsCW6h3oW7d+I2eevxpgwg+CP1+xmg9+5TsPDG+pemB4KgLXimvWceb5q9m4actQdUkzkfuzpNlo0Avk903yc1sHkjwO2HeiBarqYuCObahtxtu4aQtnr5z4hvrnXrZ2qPHDOnvlmgc+mIapS5qJ3J8lzUapmvx2WUmOA94NfLsbtQh4RVWtnGS5RcAnJzqNmOTlwMsB9ttvvyOXL18+SN1DWb3uwVuE7TcC39s4tetfvHCvgbY9zHKDar1+gA0bNrDHHntMybrmCns2vA0bNnDjXVvGnT5V+/Ns4T42PHs2PHs2sWOPPfaqqjpqsvkGClsASR4F/Hw3+O9Vde8AyyxikrDV76ijjqorr7xyoHqG0X8a8bTFm3nL6kFvLza5hQtG+NIZzxp3+sFnfvqBU4/95iV8682/sc3bP+asC1m3/uHpcbK6hrFq1SqWLFkyJeuaK+zZ8FatWsXrv3J/8/15tnAfG549G549m1iSgcLWoKcRAY6kd6+tw4AXJnnJIy1uthiZP4/Tlx464TwnHX3gUOOHdfrSQxmZP2/ouqSZyP1Z0mw00CGeJB8ADqb3MOqtx/mLGXDx+yBuOuv4bbpIflu+jbj1IvhW30bcun2/vaXZwP1Z0mw06Pm0o4An1qDnHIEk5wJLgH2S3AL8RVX90/AlTo2bzjoe6B0SvenFS7brtt+4bPGU3uphtGVHLPTDSLOG+7Ok2WbgZyPSu8/WbYOuuKpOekQVSZIkzSKDhq19gG8kuZze8xEB7yAvSZI0mUHD1htaFiFJkjRbDfog6ouSHAQcUlWfT7IbMG+y5SRJkua6gW79kOQPgY8C7+pGLQRWtCpKkiRpthj0PluvAo4BfgRQVd8EfqZVUZIkSbPFoGHr3qq6b+tAkp3p3WdLkiRJExg0bF2U5HXASJJfA84DPtGuLEmSpNlh0LB1BvB9YDW9h0Z/qqpe36wqSZKkWWLCsJXkhCSvqqr7q+o9wEH07ib/uiTP3y4VSpIk7cAmO7L1p8AFfcO70Hsg9RLgvzaqSZIkadaY7D5bu1TV2r7hS6rqDuCOJLs3rEuSJGlWmOzI1t79A1X16r7Bfae+HEmSpNllsrB1WXdD04dI8grg8jYlSZIkzR6TnUb8I2BFkt8Bru7GHQk8CljWsjBJkqTZYMKwVVW3A7+Y5FnAk7rRn6qqC5tXJkmSNAsM+iDqCwEDliRJ0pAGvampJEmSHgHDliRJUkOGLUmSpIYMW5IkSQ0ZtiRJkhoybEmSJDVk2JIkSWrIsCVJktSQYUuSJKkhw5YkSVJDhi1JkqSGDFuSJEkNGbYkSZIaMmxJkiQ1ZNiSJElqyLAlSZLUkGFLkiSpIcOWJElSQ4YtSZKkhgxbkiRJDRm2JEmSGjJsSZIkNWTYkiRJasiwJUmS1JBhS5IkqSHDliRJUkOGLUmSpIYMW5IkSQ0ZtiRJkhoybEmSJDVk2JIkSWrIsCVJktSQYUuSJKkhw5YkSVJDhi1JkqSGDFuSJEkNGbYkSZIaMmxJkiQ1ZNiSJElqyLAlSZLUkGFLkiSpIcOWJElSQ4YtSZKkhgxbkiRJDRm2JEmSGjJsSZIkNWTYkiRJasiwJUmS1JBhS5IkqSHDliRJUkOGLUmSpIYMW5IkSQ0ZtiRJkhoybEmSJDVk2JIkSWrIsCVJktSQYUuSJKkhw5YkSVJDhi1JkqSGDFuSJEkNGbYkSZIaMmxJkiQ1ZNiSJElqyLAlSZLUkGFLkiSpIcOWJElSQ4YtSZKkhgxbkiRJDRm2JEmSGjJsSZIkNWTYkiRJasiwJUmS1JBhS5IkqSHDliRJUkOGLUmSpIYMW5IkSQ0ZtiRJkhoybEmSJDVk2JIkSWrIsCVJktSQYUuSJKkhw5YkSVJDhi1JkqSGDFuSJEkNGbYkSZIaMmxJkiQ1ZNiSJElqyLAlSZLUkGFLkiSpIcOWJElSQ4YtSZKkhgxbkiRJDRm2JEmSGjJsSZIkNWTYkiRJasiwJUmS1JBhS5IkqaGmYSvJcUnWJLkhyRkttyVJkjQT7dxqxUnmAW8Hfg24BbgiyQVV9Y1W25zIojM+BcBpizdzcvd+UAlUwd67zacK7tq4if0XjHD60kNZdsTCoWtZcc06zl65hlvXb9ym9bTysPoO2zLdJUmStMNqeWTracANVfXtqroPWA6c0HB741o0ZLgarar3euc9m1i/cRMFrFu/kTPPX82Ka9YNta4V16zjzPNXs279xm1aTytj1bfuzo0zpj5JknY0LcPWQmBt3/At3bhZY+OmLZy9cs1Qy5y9cg0bNz30SNEjWU8rY9V3f9WMqU+SpB1Nauthm6lecfICYGlVvawb/l3gaVV1yqj5Xg68HGC//fY7cvny5VNey+p1dz3wfr8R+N7GqV3/4oV7PaJatmU9rYxV39aezYT6dhQbNmxgjz32mO4ydij2bDj2a3j2bHj2bGLHHnvsVVV11GTzNbtmi96RrAP7hg8Abh09U1W9G3g3wFFHHVVLliyZ8kL6r9E6bfFm3rJ66n7thQtGOOXFSwae//VnXci69Q9Pe8Oup5Wx6jtt8WaWr91zRtS3o1i1ahUt9uXZzJ4Nx34Nz54Nz55NjZanEa8ADknyuCS7AC8CLmi4ve1uZP48Tl966FDLnL70UEbmz9vm9bQyVn07JTOmPkmSdjTNwlZVbQZeDawErgc+UlXXtdreRG466/htWj7pve6923wWjMwn9I5EvfnExUN/i3DZEQt584mLWbhgZJvW08pY9S3ce2TG1CdJ0o6m5WlEqurTwKdbbmNQWwPXqlWruGmaT4ctO2LhjA4vo+tbtWrV9BUjSdIOzjvIS5IkNWTYkiRJasiwJUmS1JBhS5IkqSHDliRJUkOGLUmSpIYMW5IkSQ0ZtiRJkhpq9iDqRyLJ94GbG29mH+AHjbcx29iz4dmz4dmz4div4dmz4dmziR1UVftONtOMClvbQ5IrB3lCtx5kz4Znz4Znz4Zjv4Znz4Znz6aGpxElSZIaMmxJkiQ1NBfD1runu4AdkD0bnj0bnj0bjv0anj0bnj2bAnPumi1JkqTtaS4e2ZIkSdpuDFuSJEkNzZmwleS4JGuS3JDkjOmuZ6ZKclOS1UmuTXJlN+7RST6X5Jvd697TXed0SvLeJLcn+XrfuDF7lJ5/6Pa7ryV5yvRVPn3G6dkbkqzr9rVrk/xG37Qzu56tSbJ0eqqeXkkOTPKFJNcnuS7Ja7rx7mtjmKBf7mfjSLJrksuTfLXr2V924x+X5LJuH/uXJLt04x/VDd/QTV80nfXvSOZE2EoyD3g78BzgicBJSZ44vVXNaMdW1eF991Y5A/i3qjoE+LdueC57H3DcqHHj9eg5wCHdz8uBf9xONc407+PhPQN4a7evHV5Vnwbo/m2+CHhSt8w7un/Dc81m4LSqegLwdOBVXW/c18Y2Xr/A/Ww89wLPqqrDgMOB45I8Hfgbej07BLgTeGk3/0uBO6vq8cBbu/k0gDkRtoCnATdU1ber6j5gOXDCNNe0IzkBOKd7fw6wbBprmXZVdTFwx6jR4/XoBOD91fMVYEGSn90+lc4c4/RsPCcAy6vq3qq6EbiB3r/hOaWqbquqq7v3PwauBxbivjamCfo1njm/n3X7yoZucH73U8CzgI9240fvY1v3vY8Cv5ok26ncHdpcCVsLgbV9w7cw8T/CuayAzya5KsnLu3H7VdVt0PsPGvAz01bdzDVej9z3Jvbq7pTXe/tOT9uzUbrTNUcAl+G+NqlR/QL3s3ElmZfkWuB24HPAt4D1VbW5m6W/Lw/0rJt+F/DT27fiHdNcCVtjJW/veTG2Y6rqKfROSbwqyTOnu6AdnPve+P4ROJje6YvbgLd04+1ZnyR7AB8DXltVP5po1jHGzbm+jdEv97MJVNWWqjocOIDekb0njDVb92rPHqG5ErZuAQ7sGz4AuHWaapnRqurW7vV24OP0/vF9b+vpiO719umrcMYar0fue+Ooqu91/6G/H3gPD57CsWedJPPpBYcPVdX53Wj3tXGM1S/3s8FU1XpgFb3r3RYk2bmb1N+XB3rWTd+LwS8PmNPmSti6Ajik+4bFLvQuirxgmmuacZLsnmTPre+BXwe+Tq9Xv9fN9nvA/5ueCme08Xp0AfCS7ptiTwfu2noKaK4bdT3Rb9Hb16DXsxd133x6HL0Lvi/f3vVNt+5amH8Crq+qv+ub5L42hvH65X42viT7JlnQvR8Bnk3vWrcvAM/vZhu9j23d954PXFjeGX0gO08+y46vqjYneTWwEpgHvLeqrpvmsmai/YCPd9c77gx8uKo+k+QK4CNJXgp8B3jBNNY47ZKcCywB9klyC/AXwFmM3aNPA79B7+Lbe4Df3+4FzwDj9GxJksPpnYa4CXgFQFVdl+QjwDfofcPsVVW1ZTrqnmbHAL8LrO6uqQF4He5r4xmvXye5n43rZ4Fzum9h7gR8pKo+meQbwPIkbwSuoRdi6V4/kOQGeke0XjQdRe+IfFyPJElSQ3PlNKIkSdK0MGxJkiQ1ZNiSJElqyLAlSZLUkGFL0rRK8tokuzVc/7LpfhZqkiVJfnE6a5A0fQxbkprr7v003n9vXgs0C1v0nus23Q+eXwIYtqQ5yrAlqYkki5Jcn+QdwNXA7ya5NMnVSc5LskeSU4H9gS8k+UKS307yd93yr0ny7e79wUku6d4fmeSi7vmdK/vupn5wks9047+Y5Oe7o0nPA85Ocm2Sg8ep9fFJPp/kq119B3cB8ewkX0+yOskLu3mXJPlk37JvS3Jy9/6mJH/ZrWN1V8Mi4JXAH3U1/HKDdkuawebETU0lTZtD6d1c838A5wPPrqq7k/wZ8MdV9VdJ/hg4tqp+kOQxwOndsr8M/DDJQuCXgC92j2P5P8AJVfX9LgC9CfgD4N3AK6vqm0mOBt5RVc9KcgHwyar66AR1fgg4q6o+nmRXev8jeiK95+kdBuwDXJHk4gF+5x9U1VOS/DfgT6rqZUneCWyoqr8dtHGSZg/DlqSWbq6qryR5Lr1TeV/qnlCwC3Dp6Jmr6rvdEa896T2D7cPAM+kFr/PphbcnA5/r1jMPuC29hw//InBeNx7gUYMU2G1rYVV9vKvhJ934XwLO7e4q/r0kFwFPBSZ6GDRdnQBX0QtskuY4w5aklu7uXgN8rqpOGmCZS+kdDVsDfJHeUatnAKcBjwWuq6pn9C+Q5KeA9VV1+COoMUOO38xDL8HYddT0e7vXLfjfWEl4zZak7eMrwDFJHg+QZLck/6mb9mNgz755Lwb+pHu9BjgWuLeq7qIXwPZN8oxuPfOTPKmqfgTcmOQF3fgkOWyc9T9Et+wtSZZ1yz6q+3bkxcALk8xLsi+9I2yXAzcDT+zm2wv41QF+/wlrkDS7GbYkNVdV3wdOBs5N8jV64evnu8nvBv41yRe64S/SO4V4cXcKby1wSbee+4DnA3+T5KvAtTz4Lb8XAy/txl8HnNCNXw6cnuSa8S6Qp/cA41O72r4MPAb4OPA14KvAhcCfVtV3q2ot8JFu2ofoBcLJfAL4LS+Ql+YmH0QtSZLUkEe2JEmSGvLiTUlzRpK3A8eMGv33VfXP01GPpLnB04iSJEkNeRpRkiSpIcOWJElSQ4YtSZKkhgxbkiRJDRm2JEmSGjJsSZIkNfT/AWeVLkDqn7OaAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 720x360 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#retweet_count被关注次数\n",
    "drawScatter(df['retweet_count'],y,'retweet_count')\n",
    "\n",
    "#brand与female还有male存在一定程度上的分布差异"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlsAAAFOCAYAAACxNtjEAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3XucHXV9//H355zMhrMrZRMIVNaEWEyD0rhBAsRia2KxwYIQkRiRaLVeelGRFmPBSwXFgo8VjH1ordVawQuggimCFVSI/rwQDdeIgoAiIUHBhuWSrLDZ/fz+mJmTObNzLnv5nt1sXs/HYx+7Zy7f+c7n+53vfHZmzjnm7gIAAEAYpcmuAAAAwHRGsgUAABAQyRYAAEBAJFsAAAABkWwBAAAERLIFAAAQEMkWAABAQCRbwB7IzN5tZp9pMP9+MzuuhXLmm5mb2YyJreHEMLNlZvZggHL/w8ze12C+m9lzJnq7U4GZfc7Mzp/segB7E5ItYA/k7v/q7m+a7Hrsqdz979z9g6Ndz8w+ZWaXFkx/vpk9ZWazzazbzD5rZr8xsyfM7Bdm9s8Nyuwws38xs7vNbIeZbTWz/zWzvxxt/QBMTSRbACbEVLs6ZmblAMV+TtIpZtaVm/46Sde4+3ZJH5X0DEnPlbSfpJMk3degzK9KOjkpY5akZ0v6mKQTJrTm4zTV2hfYk5BsAVOYmf1zcqXjieTKx18k0881sy9klnutmf3azP7PzN6TK6NkZmeb2X3J/C+b2ezcpv7GzLaZ2UNmdlZm3aPN7Edm1p/M+7iZdWTmu5m91czukXRPk3053My+ZWbbzey3ZvbuZPpMM1uXbH9b8vfMOmU818w2JPW508xOysz7nJl90sy+YWY7JC1vUJeaW2lmtjbZv21m9jf11nP3H0naKumVmXXLkl4j6ZJk0lGSvuTuj7r7sLvf5e5frVOP4yS9VNLJ7r7R3Z9Ofr7p7u/ILHewmV1pZo+Y2a/M7IzMvHOTNr006Sd3mtmSzPwjzOyWZN4VkvbJ1eFEM7stiekPzez5mXn3J33wDkk7SLiAsSHZAqYoM1so6W2SjnL3fSWtkHR/wXLPk/RJSa+VdLCk/SU9K7PIGZJWSnpxMv9RSZ/IFbNc0gJJfynp7MzzXkOS/lHSAZJeKOkvJP1Dbt2Vko6R9LwG+7KvpG9L+mZSh+dI+k4y+z2SlkpaLKlX0tGS3ltQRiTp65Kul3SgpLdL+mISp9RrJH1I0r6Svl+vPrlyj5f0TsVJzwJJzZ51u1TxVajUcZIiSf+bvL5J0ofM7A1mtqBJWcdJ2ujudZ9LM7OS4v2+XVKP4jY408xWZBY7SdLlkrolXS3p48m6HZLWS/q8pNmSvqLaRPEFkj4r6W8V95tPSbo6l+yepvgqW7e772qyPwAKkGwBU9eQpJmSnmdmkbvf7+5Ft6NOVXwL63vu/pSk90kazsz/W0nvcfcHk/nnSjo1d5XiPHff4e6bJf234hOs3P1md7/J3Xe5+/2KT8Yvzm3/Anff7u4DDfblREm/cfeL3P337v6Eu29M5p0u6QPu/rC7PyLpPMWJY95SxbfnLkyu/twg6Zq0ron/cfcfJFeUft+gPlmvkvTf7v5Td9+hOD6NfF7Si80sTWhfp/hK1mDy+u2Svqg4Uf6Zmd1rZi+rU9YBkn6Tvkie+eo3s8fMLK3/UZLmuPsHkv3+paRPS3p1ppzvu/s33H0oqV9vMn2p4kRwnbsPJlfYfpJZ782SPpVcVRty90skPZWsl/o3d9/SpH0BNECyBUxR7n6vpDMVn/wfNrPLzezggkUPlrQls94OSf+XmX+IpK8lJ/F+ST9XnMgdlFlmS+bvXydlysz+2MyuSR72flzSvypOEFRn3Xrmqv5zSwcn2xyx/YLltrj7cG7ZnlHWpbDcXJl1ufsDkr4naY2ZPUPxlb1LMvMHkjcwHKn4atGXJX2l4NatFLfTMzPrbnf3bklHKk60pbj9Dk7bL2nDd6u2/X6T+XunpH2SZPpgSVvd3evs3yGSzsqVPVe18R9LTAFkkGwBU5i7f8ndX6T4pOiSPlyw2EOKT5CSJDPrVHyST22R9DJ378787OPuWzPLzM38PU/StuTvT0q6S9ICd/8DxSd5y1ezhV3ZIunQOvO2Kd6/ou3nl5ub3FbLLpvdj1bqklcTv6TMZi5RfEXrlZJ+5e63FC3k7mmC2qX4wfe870g6KnOVrMiWZBvZ9tvX3f+qhXo+JKnHzLJtlt2/LZI+lCu7090vy+5GC9sB0ADJFjBFmdlCM3tJ8vzM7yUNKL4ilfdVSSea2YuSZ3Q+oNpj+z8UP0N0SFLuHDM7OVfG+8ys08wOl/QGSVck0/eV9LikJ83sMEl/P8bduUbSH5rZmckD8fua2THJvMskvTep1wGS/kXSFwrK2Chph6R3mVlkZsskvVzxs0rj8WVJrzez5yWJ6vtbWOdKxQnaecpc1ZIkM3ufmR1l8Uc67CPpHZL6Jd2dL8Tdr5d0o6T1ZnZMsk6k2tt4P5b0ePKgesXMymb2J2Z2VAv1/JGkXZLOMLMZZnaK4mfiUp+W9HfJts3MuszshOQZOwAThGQLmLpmSrpQ0u8U3yY6UPGVpRrufqekt0r6kuIrGY9Kyj5w/THFD01fb2ZPKH6A+5hcMd+VdK/iKy0fSZIAKX5w/DWSnlB8Yr5CY+DuTyh+AP3lyb7co93vFjxf0iZJd0jaLOmWZFq+jKcVPwj+MsUx+XdJr3P3u8ZSp0y5/ytpnaQbFMfghhbW2aHdCdcX87MVP/f2O8VX414q6QR3f7JOcacoTka/oDgp+5Xi59iOT7Y1pDhui5N5v5P0GcUfK9Gsnk8n5b9ecb9YLemqzPxNip/b+ngy/95kWQATyGpv5QMAAGAicWULAAAgIJItABPCzP7MzJ4s+pmk+txZpz6nT0Z9AOy9uI0IAAAQEFe2AAAAAppS33N1wAEH+Pz584NuY8eOHerqyn+HLEIi5u1HzCcHcW8/Yt5+xHy3m2+++XfuPqfZclMq2Zo/f742bdoUdBsbNmzQsmXLgm4DtYh5+xHzyUHc24+Ytx8x383MGn7jRIrbiAAAAAGRbAEAAAREsgUAABAQyRYAAEBAJFsAAAABkWwBAAAERLIFAAAQEMkWAABAQCRbAAAAAU2pT5APaf7Z10qSzlq0S69P/u7prmhb/4C6OyO5S48NDOrg7orWrliolUf0VNddf+tW9V13t7b2D6hspiF39STLSVLfdXdrW/9Aw3XT+csPm6Mb73qk7vLN1KvL/P0r+uF925V+rXhXR1kfesWimrLTGGT1NKhDUd2vveMhPbpzUJLUGZXUMaNcjVu6b1v7B2RStS5nLdqlt/7LN7Xz6aHqPkvSOVfdoYHB4er2oiT1z0zSzBklPbUrMyGx4MAu7Xx6uKb9+gcGZSal363eXYl0Yu8zq3UqmTSczKtEJe0TldW/c7CwXfKvOztKuufhHU1jl43ZPlFc92GXymY67Zi5On/loqZxXrtioTb9ersu27hFQ5kvis9ur2idojYsavNUdyVS/8DgiOnZGGbbMRvPeseNtPt4yM+v1z+6OsqKyiX1DwxW+/Ss3Lpb+wfq7kfWPmXT/vvuUxOXbJ1mlGr7V9mkIR9Zj2xfaWbN0nnVmLyrd0hnnne9HhsY1H6VSGZS/85BdXdG1eMmb1ZnpPe//HBJ0rlX31ltk3R6o/Ek2+7rb91ad/38vFak/SNtk4mQ9uEzr7htxLx1qxeP2Ld3fvk27SrYdPZY6B8Y1BEfuL5ufKV4rJoZlfXozvr7s271YkmNx/N6mrVLOi/tE4/urO1j2eMha83SeSPGASlum3NPitv29E//SD+4b3vd+c3qmh3rsn12rPtfdNxnzxuNxpT8MZs9z3Xn6rb8sDn62i1btePpoab1u//CE5ouE5r5BB1EE2HJkiUe4ut6siecsxbt0kWbG+eYlaisC05ZVB2kzrlqswYGRzZoVDbJpcHMqNzqukXLN9NKeVnlkumiVb1aeURPw5NuUR1Gu61G8jGPyqbBoanT78ZjtO29Zum8moSraJ1GJ/pKVNYrj+zRlTdvrVkn34YbNmzQ67+5o7iQQKKSSaYp1bZFx2hIrYwvRUoW/85XMyqb+k7trdu/0naXpLVfuX3EfkZl0+qj5uqKH29pWwzaJT0W/nDnr/SRO8oTUmZUsrrjeT3N2mWixtGius4/oHPEP4HZ+X3J+N+oro3U2//s1/VM5LlCCnfMhkq4zOxmd1/SbDluIxYYGBxS33V3S4qz63qdaHDIR3SIVtctWr6ZVsrLGhr2lsouqsNotzUaU+lkPF6jbe/LNm6peV20TqMxZmBwSJdt3DJindH0o1AGh33KtW3RMToVDXtxuw8OecP+lbZ733V3F+7n4JDrso3TL9GSdh8LE3nBoNF4Xk+zdgk2jg573UQrnT/ecX2s+z8ee8oxO1p7zW3E0dqW3LbY1uLti/GsO9HLTUTZY9nW3mo07Z2/HTCWONe7pUObTU/N+lezdp+oW4BTUTv2rVl8x9ou7TAR4/pY9x+1uLJVx8HdlZrfIded6OUmouyxbGtvNZr2LpsVrjsa+TLGUxamvmb96+DuSsO2r9dfpoN27Fuz42qs7dIOEzGuj3X/UYtkq0AlKlcf0lu7YqEqUfHzAFHZ4udUxrBu0fLNtFJeVrlkLZVdVIfRbms0ovL0GfxH296nHTO35nXROqUG4alEZZ12zNwR64ymH4USlWzKtW3RMToVlay43aOyNexfabuvXbGwcD+jcvzGjD0hBqOVHgs2gQlXo/G8nmbtEmwcLZkWHNjVcP54x/Wx7v947CnH7GjtFclWvQfjerorMsXv2umuRLJkWvaBwJVH9OiCUxapJ8ne0/+keror6ju1V32reqvlNFo3nb9m6by6yzfTqC7HHjpb2e7Z1VGuPhzfLAZFdahX91mdUXWZzqhUE7d03yQpf6h0dZSry/Wd2qt1qxerEtV2v6i0+x2JqZkzirvoggO7RrSfFL+TLtVdiWrqlD1+K1FJszpH1r3e66JBrVl7V6JSdZtlsxEPx9eL88WvWqw1S+eN+K893d75KxeNWKeoDZs9EJrGLC+72WwNsvEsOm76VvWq79TeuvPr9Y+ujnK1Luk+59dt1T5lq4lL/hjN969sbpitx2jG+mxMZpSsWu/uSlTtY9njJm9WZ6SLX7VYF79qcU2bzOqMqg/HS8V9JW33lUf0qG9Vb+H6569cNGJeK/JtMhF6uivVd/3lrVu9uGbf1q1erBl1Np09Fp41q9IwvlI8VqXL1NufdasXNxzP62nWLtl5aZ+QavtYvQgXjQNKyulb1atv/dMyHXvo7LrzWx3X8/Ubz/4XHffZ9mk0puSPWWl3e+XrtmbpPHV1tJbk8W7EnFDvRszKvosC7UHM24+YTw7i3n7EvP2I+W68GxEAAGAKINkCAAAIiGQLAAAgIJItAACAgEi2AAAAAiLZAgAACIhkCwAAICCSLQAAgIBItgAAAAIi2QIAAAiIZAsAACAgki0AAICASLYAAAACItkCAAAIiGQLAAAgIJItAACAgEi2AAAAAiLZAgAACIhkCwAAICCSLQAAgIBItgAAAAIi2QIAAAiIZAsAACAgki0AAICASLYAAAACItkCAAAIiGQLAAAgIJItAACAgEi2AAAAAiLZAgAACIhkCwAAICCSLQAAgIBItgAAAAIi2QIAAAiIZAsAACAgki0AAICASLYAAAACItkCAAAIiGQLAAAgIJItAACAgEi2AAAAAiLZAgAACIhkCwAAICCSLQAAgIBItgAAAAIi2QIAAAiIZAsAACAgki0AAICASLYAAAACItkCAAAIiGQLAAAgIJItAACAgEi2AAAAAiLZAgAACIhkCwAAICCSLQAAgIBItgAAAAIi2QIAAAiIZAsAACAgki0AAICASLYAAAACItkCAAAIiGQLAAAgIJItAACAgEi2AAAAAiLZAgAACIhkCwAAICCSLQAAgIBItgAAAAIi2QIAAAiIZAsAACAgki0AAICASLYAAAACItkCAAAIiGQLAAAgIJItAACAgEi2AAAAAiLZAgAACIhkCwAAICCSLQAAgIBItgAAAAIi2QIAAAiIZAsAACAgki0AAICASLYAAAACItkCAAAIiGQLAAAgIJItAACAgEi2AAAAAiLZAgAACIhkCwAAICCSLQAAgIBItgAAAAIi2QIAAAiIZAsAACAgki0AAICASLYAAAACItkCAAAIiGQLAAAgIJItAACAgEi2AAAAAmqabJlZ2cy+3Y7KAAAATDdNky13H5K008z2a0N9AAAAppUZLS73e0mbzexbknakE939jCC1AgAAmCZaTbauTX4AAAAwCi0lW+5+iZlVJM1z97sD1wkAAGDaaOndiGb2ckm3Sfpm8nqxmV0dsmIAAADTQasf/XCupKMl9UuSu98m6dmB6gQAADBttJps7XL3x3LTfKIrAwAAMN20+oD8T83sNZLKZrZA0hmSfhiuWgAAANNDq1e23i7pcElPSbpM0uOSzgxVKQAAgOmi1Xcj7pT0nuQHAAAALWqYbJnZ19Xg2Sx3P2nCawQAADCNNLuy9ZHk9ymS/lDSF5LXp0m6v9GKZvZZSSdKetjd/2QcdQQAANhjmXvzNxWa2ffc/c+bTcvN/3NJT0q6tNVka8mSJb5p06ZWFh21+WfHH4B/1qJdumjz7hzTVP/SXdlMS/9olu7/vwFt7R9Q2UxD7urprmj5YXN0412PaFv/gLo7I/1+cEgDg8OSpFmdkd7/8sO16dfb9YWbHqgpc+aMkp7aFS/XXYl07kmHa+URPQ3r/t71m/WljQ9oOKloJSrpglOe33C99bdu1T9dcZuGG5YsdUYl7RzcvVQ+Hl0dZUXlkh4bGNTB3RWtXbGwcLvrb92qc6++U/0DgyPmpTFP49ddiWQm9e+My5y/f0U/vG97dbsdZVPXzBnV+csPm6PLNj6goVxDzcrF3UxyV7V9rrn9oZr6mEmnHzNPSw6Zrb7r7ta2/oGafVp/69aa6WkbZ9s+uw+DQ8Pa8fSQpNq2zMcirVe9/V+7YqEkjdh2tv4lk4Y93md3Vdsj3w+fGhzSzsFhnbVol/79ro5qbLIqUUn7RGU9unNkW0nSutWLa9o4vz9p/87Web/cPi0/bI6uveOhuts4aN8OPbXLa9onG8OXXrxB9zy8o3DdVmSP3WbtnO3T6bz88d6sj6Txf9Nzfq++O8otx3vd6sWSpHdfdUf1OEz76fkrFxXWJ9t/upP+0D8wWJ2fPy6ysuNPMz2ZvnnmFbcV1n3Tr7frizc9UDNm5I+FfP3T4yHv/gtPqHm9/tathdvNL7dhwwYtW7ZMkvScc67VrkzZM0y694La5aXd54OsZse2tLu/Z+Oe1dVR1odesajpmJ7vR/P3r+gH920fsdyCA7tqjoOi46bIrM6o2t+6K5FO7H3miPEwf/4pOi4k6byv3zmirJ6BX+nDt5drtvf+lx8+Ytyod5wVee/6zTV9aeaMkipRufDcU1R2UV9J5fvMRDKzm919SdPlWky2fi7pBHf/ZfL62ZK+4e7PbbLefEnXTHaylT2w8slWKOnJsZmoZOpb1Vu3E753/eYRCZsUv7Ph4txJMVVvkJoIlaisC05ZNOKgWvuV2zVYZ4fbFfNW5dumEpX1yiN7dOXNWzUwODTmcqOSafXRc3XFj7fUjUW99WTSYD6bHIfxxjxNuOq1bblkKkmj2s9WRCXT7K5Iv33i6QktV6rfzmmflqRzrtpc2Ada7SMT2dePPXS2bnngsXH1yfGKyjamfpkeC6M9ptKTYrMxLHvyTJOtfKKVyidcRYlWMyVJ5RZjUS6ZLmowpq+/dWvdftZu6flHGtn3o5JpWNJQwTFe1M+jsqnv1N7quJEvr+jckap3nstq5ThtJFTC1Wqy1eq7Ef9R0gYz22BmGyTdKN6N2FCr56DBYVffdfW/AemyjVuKy5fqrteovPEaGBwaUX7fdXdP+Ek3pHxVBwaHdNnGLeMe/AaHXZdtHF2ila43kYnWREjbuF7bDg17kDYfHPYgiZZUv53TPt133d11+8BE9ZHR+MF92yf9hDzWfpkeC2Ot/1jGsKJEq9H00RhW67EYajKmN+pn7Zaef4rqNDjshYlW3bKGvGbcqHecFal3nitafyrFbzRaurIlSWY2U9Jhycu73P2pFtaZryZXtszsLZLeIkkHHXTQkZdffnlL9RmNzVt3fx7rQRXptwMTvolxW9SzX+H0bN1bXa/ZOhMhu91m25uqMZ/OJiLmi3r2a0tfmk7o6+OTjiujGfeefPJJPeMZz2i4zmjGq4ky1jF9T9ConzcbN6bCOWsiLV++fOJuI0qSmf2ppPnKPFTv7pc2WWe+9tLbiKPR013RD85+SeG8Q8/5hobqtFG99Y698AZt7Q834ue322x7UzHmeenzGlOlnPEab8zTNg7dl9qtXvv0dFckqeG+ttK2e0Jfb5exHAvprZ5m/a7oNmKj24PZ5cdyG3G0Go3pU+2YaqXv59Xr583GjXpxaXSeG29dU3vEbUQz+7zidya+SNJRyU/TwvdmJWttuahk1QcRi5x2zNzi8qW66zUqb7wqUXlE+WtXLIyfO9pD5Ktaico67Zi5qkTl4hVaFJVMpx0zd9SxiEqmqDy14pe2cb22LZcsSJtHJdNB+3ZMeLlS/XZO+/TaFQvr9oGJ6iOjceyhs9u6vSJj7ZfpsTDW+o9lDJtRp6r1po9GSa3HotxkTG/Uz9otPf8U1SkqmcqjOMajstWMG/WOsyL1znNF60+l+I1Gq89sLZF0rLv/g7u/Pfk5o9EKZnaZpB9JWmhmD5rZG8db2bFqlNE26kplMx176OxqNl22eOme7orWLJ2nnu6KTPE7MSrR7lDO6ox08asWa83SeSPKnDlj93Ldlajhw/GSdP7KRVqzdF5NglCJSnUfjpeklUf0aN3qxS01bmdUu1Q+Hl0d5fjdT4r3u+gBx5VH9KhvVa+6K1HDbaXx665EmtW5u8xjD51ds92OstXMX7N0norGuXzck+Kr6+TrYyatWTpPF79qcbXt0n06f+UiXXDKoprpaRtn657dh66O3Qd82pbnr1w0IhZpvertf9+qXvWd2jti29ky0vaf1RnVtEe+H2bbsxIV94BKVNKszvptlX03YlHbzuqMdNGqXvWt2l3n/D6tWTqv4TYO2rdjRPukMdz4npdqwYFddddtRfbYbdbOaZ9eeURPdV5ahpqsm49/yUZ21EbxXrd6sdatXlzTbmk//eKbX1hYn2ys0/6QnZ8/LrKy408zPd0V9Z3aW33HZFHd1yydN2LMyB4LRfUvCJGk2nE6HcOaLZd17wUnjEisit6NWG/9Rsf2xasX1xyj2bhndXWUGz4cL6mmn2XHwCL546DouCmS7W/dlahwPMyef4rq1LeqVxet6i0sa0YuCZvVGVUfjq+3j/Uejpd2n+eypc6cUSo89xSVXa+vpEK+G7FVrb4b8SuSznD3h0JWJuRHP6SybxNGexDz9iPmk4O4tx8xbz9ivlurtxFbfbjgAEk/M7MfK/5+REl8gjwAAEAzrSZb54asBAAAwHTV6hdRf9fMDpG0wN2/bWadkva8J9QAAADarNV3I75Z0lclfSqZ1CNpfahKAQAATBetvjXlrZKOlfS4JLn7PZIODFUpAACA6aLVZOspd69+h4aZzVD9728GAABAotVk67tm9m5JFTN7qaSvSPp6uGoBAABMD60mW2dLekTSZsXfY3itu78nWK0AAACmiYbJlpmdbGZvdfdhd/+0pEMUf5r8u83s1LbUEAAAYA/W7MrWuyRdnXndIelIScsk/X2gOgEAAEwbzT5nq8Pdt2Ref9/dt0vabmbj+/IyAACAvUCzK1uzsi/c/W2Zl3MmvjoAAADTS7Nka2PygaY1zOxvJf04TJUAAACmj2a3Ef9R0noze42kW5JpR0qaKWllyIoBAABMBw2TLXd/WNKfmtlLJB2eTL7W3W8IXjMAAIBpoNUvor5BEgkWAADAKLX6oaYAAAAYA5ItAACAgEi2AAAAAiLZAgAACIhkCwAAICCSLQAAgIBItgAAAAIi2QIAAAiIZAsAACAgki0AAICASLYAAAACItkCAAAIiGQLAAAgIJItAACAgEi2AAAAAiLZAgAACIhkCwAAICCSLQAAgIBItgAAAAIi2QIAAAiIZAsAACAgki0AAICASLYAAAACItkCAAAIiGQLAAAgIJItAACAgEi2AAAAAiLZAgAACIhkCwAAICCSLQAAgIBItgAAAAIi2QIAAAiIZAsAACAgki0AAICASLYAAAACItkCAAAIiGQLAAAgIJItAACAgEi2AAAAAiLZAgAACIhkCwAAICCSLQAAgIBItgAAAAIi2QIAAAiIZAsAACAgki0AAICASLYAAAACItkCAAAIiGQLAAAgIJItAACAgEi2AAAAAiLZAgAACIhkCwAAICCSLQAAgIBItgAAAAIi2QIAAAiIZAsAACAgki0AAICASLYAAAACItkCAAAIiGQLAAAgIJItAACAgEi2AAAAAiLZAgAACIhkCwAAICCSLQAAgIBItgAAAAIi2QIAAAiIZAsAACAgki0AAICASLYAAAACItkCAAAIiGQLAAAgIJItAACAgEi2AAAAAiLZAgAACIhkCwAAICCSLQAAgIBItgAAAAIi2QIAAAiIZAsAACAgki0AAICASLYAAAACItkCAAAIiGQLAAAgIJItAACAgEi2AAAAAiLZAgAACIhkCwAAICCSLQAAgIBItgAAAAIi2QIAAAiIZAsAACAgki0AAICASLYAAAACItkCAAAIiGQLAAAgoKDJlpkdb2Z3m9m9ZnZ2yG0BAABMRebuYQo2K0v6haSXSnpQ0k8knebuP6u3zpIlS3zTpk1B6jP/7GslSWct2qWLNs+QJHVXIp170uFaeUSPJGn9rVvVd93d2to/oJJJw5nQpMtK0nlfv1OP7hysKb+7EunE3mfq2jseGjEvrxKVZJJ2Dg6Pah86o5I6ZpTVPzCospmG3GUmZZuwq6OsYXcNJGV3RiXNjMp162SSDu6uaPlhc3Ql1vyYAAAN5klEQVTjXY9oW/+A9qtEenrXULV++TJNkufKqHmd1Cn9fdaiXfrsfRWd8Pza+OTjn5W2xbb+AR3cXdHaFQslqdo+1f3PbNsknb50npYcMru6bndnpN8PDlXr3lE2DQ57tX4zTMo2Q75t0vj17xwcUY+iWKVmdUZ6/8sP1yduvEf3PLyjMPZ5nVFJLlXrmpdt63S/68XhnzL9PO/+C0+o6etFolJtXI49dLZWLZlXXSff5vk6pvWalYt/V0dZ7l6NV/a4KqpPV0dZr3hBj75w0wOF9WxFUV0rUUn7JO3a3RnpqcGhunVK23lwaFg7nh4q3EZXR1lRuaQ3PmdAl2/ZV2tXLNTKI3pq+nFRvGZ1Rk3HC6l5XFP1xrT0OFp+2JxRxTLd3mj0dFdqjtHuSiQz6dGdg2MqLy/fnumY3l2J1D8wMpadUanatvnxsp6ymZb+0Sz97KEnRrRPzbGWlNeTxPaa2x8qrENeen6Z1RnJXS2tk+quRHpsYLBhe5pUd2yqd67K1qmob2X3+6xFu/Rf91ZGlFNvXEjndXaUtfPpIe1XibTz6V16emjk0ml/Scfc5YfNqXteNZOeM6dL9z68o+52pXjMC8XMbnb3JU2XC5hsvVDSue6+Inl9jiS5+wX11gmVbKWJllSbbElSVDL1reqVJJ1z1WYNDBYPplJ8GdBKpqHhMDGbrvIxz0rjn0241t+6dURbRCWTTBosODjzygHbKCqb5NLgFO8DjWIuSZWo3LCvt1NJUrlsLbVtu4y1TmncK1FZrzyyR1fevLXtcR7NmDYdNOvrmHh7YsxDJVytJlsho9UjaUvm9YOSjgm4vTEZHHb1XXe3JDUdlIal2stdGLc0/tlkq++6u0e0xWiSm5DJ8FRKCMZjKp2AhyUNT7G4jrdOA4NDumzjlnFfxRmL0YxpANoj5JWtVZJWuPubktevlXS0u789t9xbJL1Fkg466KAjL7/88gmvy+atj1X/Pqgi/bb4zgkCaSXmi3r2q/6dbS+MDf18chD39iPm7bcnxjx7jplIy5cvn/QrWw9Kmpt5/SxJ2/ILuft/SvpPKb6NuGzZsgmvyOsb3EaU4vvtkuo+v4LxaXbJuae7orefvqz6+j0X3kBbjNOeeJl/OsjGfSKeTxqrvWlMo6+3354Y8/sz55jJEPLdiD+RtMDMnm1mHZJeLenqgNsbk6hkWrtiodauWKhKVG64bEnx80CYOGn8s4raIipZ/LxUC0K2UVS2+PmxPVyzvt5OJanltm2X8dapEpV12jFzJyXOoxnTALRHsGTL3XdJepuk6yT9XNKX3f3OUNtrpN6Dcd2VqPpw9sojenTBKYuq/xHmz6fdlUgXr16si1b1alZnVFjWmqXzCuflVaKSOqPRh74zKqm7EpdftriClqtnV0dZlUzZnVGpYZ1M8X/Ba5bOU093RZbsS7Z++TLzp6ARr632txS/wyUfn2z8s7Jtkdavb1Wv+k7trbZPdf9z9VizdJ4uWtVbXXdWZ1RT946y1dQv3wz5tknjV63Hqb3qy5Sfj1V2f9etXqwFB3aNmFdPZ1SqqWteNp7pn/Xi0Mj9F55Q09eL5Ktx7KGztW714uo69baR1jGtVz7+XR3lmnilx1W2bbO6Ospas3Rekz1qrKiulUy7zuqMGtYpbeeujvqJS1dHuXps9nRXdMEpi3T+ykU1/bhIK+OF1Dyu2boXjWnZ43w0yvkBpgX5Y7S7ElX3cyzl5dUrIY1/XrZtW9182UzHHjq7sH1qjrXkRRrbenXIS88vszqjltdJdVeipu3ZaGyqd67K1qmob+VDV1ROo/Ca4uMkrVtHnX9m0v6S3cd6x4mZtODArpbGvMkW7JmtsQj50Q+pDRs2KMStStRHzNuPmE8O4t5+xLz9iPlurb4bkU+QBwAACIhkCwAAICCSLQAAgIBItgAAAAIi2QIAAAiIZAsAACAgki0AAICASLYAAAACItkCAAAIaEp9gryZPSLp14E3c4Ck3wXeBmoR8/Yj5pODuLcfMW8/Yr7bIe4+p9lCUyrZagcz29TKR+tj4hDz9iPmk4O4tx8xbz9iPnrcRgQAAAiIZAsAACCgvTHZ+s/JrsBeiJi3HzGfHMS9/Yh5+xHzUdrrntkCAABop73xyhYAAEDb7DXJlpkdb2Z3m9m9Znb2ZNdnOjGzz5rZw2b208y02Wb2LTO7J/k9K5luZvZvSTvcYWYvmLya77nMbK6Z3WhmPzezO83sHcl04h6Ime1jZj82s9uTmJ+XTH+2mW1MYn6FmXUk02cmr+9N5s+fzPrvycysbGa3mtk1yWtiHpiZ3W9mm83sNjPblExjfBmjvSLZMrOypE9Iepmk50k6zcyeN7m1mlY+J+n43LSzJX3H3RdI+k7yWorbYEHy8xZJn2xTHaebXZLOcvfnSloq6a1Jnybu4Twl6SXu3itpsaTjzWyppA9L+mgS80clvTFZ/o2SHnX350j6aLIcxuYdkn6eeU3M22O5uy/OfMwD48sY7RXJlqSjJd3r7r9096clXS7p5Emu07Th7t+TtD03+WRJlyR/XyJpZWb6pR67SVK3mT2zPTWdPtz9IXe/Jfn7CcUnoh4R92CS2D2ZvIySH5f0EklfTabnY562xVcl/YWZWZuqO22Y2bMknSDpM8lrEzGfLIwvY7S3JFs9krZkXj+YTEM4B7n7Q1KcGEg6MJlOW0yw5FbJEZI2irgHldzOuk3Sw5K+Jek+Sf3uvitZJBvXasyT+Y9J2r+9NZ4W1kl6l6Th5PX+Iubt4JKuN7ObzewtyTTGlzGaMdkVaJOi/2x4G+bkoC0mkJk9Q9KVks5098cb/BNP3CeAuw9JWmxm3ZK+Jum5RYslv4n5OJnZiZIedvebzWxZOrlgUWI+8Y51921mdqCkb5nZXQ2WJe5N7C1Xth6UNDfz+lmStk1SXfYWv00vIye/H06m0xYTxMwixYnWF939qmQycW8Dd++XtEHx83LdZpb+45qNazXmyfz9NPJ2Oxo7VtJJZna/4sc/XqL4ShcxD8zdtyW/H1b8j8XRYnwZs70l2fqJpAXJO1g6JL1a0tWTXKfp7mpJf538/deS/icz/XXJu1eWSnosvSyN1iXPofyXpJ+7+8WZWcQ9EDObk1zRkplVJB2n+Fm5GyWdmiyWj3naFqdKusH5YMNRcfdz3P1Z7j5f8bh9g7ufLmIelJl1mdm+6d+S/lLST8X4MmZ7zYeamtlfKf6PqCzps+7+oUmu0rRhZpdJWqb4m+B/K+n9ktZL+rKkeZIekLTK3bcnScLHFb97caekN7j7psmo957MzF4k6f9J2qzdz7K8W/FzW8Q9ADN7vuKHgsuK/1H9srt/wMz+SPFVl9mSbpW0xt2fMrN9JH1e8fN02yW92t1/OTm13/MltxHf6e4nEvOwkvh+LXk5Q9KX3P1DZra/GF/GZK9JtgAAACbD3nIbEQAAYFKQbAEAAAREsgUAABAQyRYAAEBAJFsAAAABkWwBAAAERLIFIAgz+4yZPa9g+uvN7ONN1j3XzN4ZrnbNtVLPFss5yczOrjPvyaLpAKaXveW7EQG0mbu/aTK2m3zAorn7cNOFJ3a7MzJfjlzl7leLb6wA9mpc2QIwbsnXe1xrZreb2U/NbLWZbTCzJcn8N5jZL8zsu4q/7y5db46ZXWlmP0l+js0U22tmN5jZPWb25mT5Z5jZd8zsFjPbbGYnJ9Pnm9nPzezfJd2i2u9py9bz+GTd283sO8m02Wa23szuMLObkk+Kz693SLLdO5Lf85LpnzOzi83sRkkfrrPN6hWy5CvDfpTs6wdHHWgAeySubAGYCMdL2ubuJ0iSme0n6e+Tv58p6TxJR0p6TPH32t2arPcxSR919+8nCcx1kp6bzHu+4i967pJ0q5ldq/iLb1/h7o+b2QGSbjKz9KrRQsVfE/IPRRU0szmSPi3pz939V2Y2O5l1nqRb3X2lmb1E0qWSFudW/7ikS939EjP7G0n/JmllMu+PJR3n7kMtxOljkj7p7pea2VtbWB7ANMCVLQATYbOk48zsw2b2Z+7+WGbeMZI2uPsj7v60pCsy846T9HEzu03xrbY/SL8AV9L/uPuAu/9OcYJ2tCST9K9mdoekb0vqkXRQsvyv3f2mBnVcKul77v4rSXL37cn0Fyn+Pj25+w2S9k+SxawXSvpS8vfnk3VSX2kx0ZLiq3qXZcoBsBfgyhaAcXP3X5jZkZL+StIFZnZ9fpE6q5YkvdDdB7IT48euRqzjkk6XNEfSke4+aGb3S9onmb+jSTWtTj2sYFqzL43Nzm+23dGWDWCa4coWgHEzs4Ml7XT3L0j6iKQXZGZvlLTMzPY3s0jSqsy86yW9LVNO9vbdyWa2j5ntL2mZpJ9I2k/Sw0mitVzSIaOo5o8kvdjMnp1sK72N+D3FSZzMbJmk37n747l1fyjp1cnfp0v6/ii2m/WDXDkA9gJc2QIwERZJ6jOzYUmDip/X+ogkuftDZnau4mTnIcUPsJeT9c6Q9InktuAMxYnP3yXzfizpWknzJH3Q3beZ2Rclfd3MNkm6TdJdrVbQ3R8xs7dIusrMSoqf/3qppHMl/XdSh52S/rpg9TMkfdbM1kp6RNIbWt1uzjskfcnM3iHpyjGWAWAPY+5c0QYAAAiF24gAAAABcRsRwLRjZhslzcxNfq27bw64zTcovk2Y9QN35yMegL0ctxEBAAAC4jYiAABAQCRbAAAAAZFsAQAABESyBQAAEBDJFgAAQED/H7o3TTjN2e/4AAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 720x360 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#sidebar_color特征\n",
    "# from sklearn.feature_selection import LabelEncoder  #引入特征筛选的包\n",
    "\n",
    "#构建link_color字典dic_sidebar_color\n",
    "dic_sidebar_color = []\n",
    "df_sidebar_color = df['sidebar_color']\n",
    "list_sidebar_color = df_sidebar_color.tolist()\n",
    "\n",
    "for i in list_sidebar_color:\n",
    "    if i not in dic_sidebar_color:\n",
    "        dic_sidebar_color.append(i)\n",
    "        \n",
    "#通过字典进行label编码\n",
    "label_sidebar_color = preprocessing.LabelEncoder()\n",
    "label_sidebar_color.fit(dic_sidebar_color)\n",
    "df_sidebar_color_id_tmp = label_sidebar_color.transform(df_sidebar_color)\n",
    "df_sidebar_color_id = pd.DataFrame(df_sidebar_color_id_tmp, index=df.index, columns=['sidebar_color_id'])\n",
    "\n",
    "#调用分布绘制函数\n",
    "drawScatter(df_sidebar_color_id,y,'sidebar_color_id')\n",
    "\n",
    "#边框颜色存在分布密度不同的差异，可以重点观测该属性"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlsAAAFNCAYAAAA3oqpqAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3Xuc3HV97/HXJ8uCm6BsIpeaFQxiipVGQFOFE7XBVoNaNYfihYKWXrA361ExLSg94nnQQk21Pdb2aDnV4qVARZuDVRupGFSEKAgYvEQBoxgooCFy28KSfM4fv98sk8nM7iTZ785m9vV8POaxM9/f7/f9fuczs5t3fpeZyEwkSZJUxpxeT0CSJKmfGbYkSZIKMmxJkiQVZNiSJEkqyLAlSZJUkGFLkiSpIMOWJElSQYYtSdrLRURGxNN6PQ9J7Rm2pD4SEZsi4lenecxzI+Jj0znmroiIdRHxux2WPS4itkbEC9ss++uIuKy+/7yI+GpE/CwitkTE1RHxSxOMuTgiLomIeyLivoj4fkT8bUQ8eeqemaS9hWFL0qyVmf8FXAq8vrk9IgaAU4CLIuIJwL8BfwssAEaAdwEPt+uz3sO0HrgDODYznwAsA24FnlfmmeyeiNin13OQZgPDltQnIuKjwGHApyPigYj4k4i4KCLOrJeP1Ieb/rB+/LR6L03Uj38tIm6s9/R8NSKe2dT3woj4ZL2n5gcR8aa6/UTg7cBr6jFvmmSOCyLiwxFxR0TcGxFrmpadERG31HO6PCIW1u2L6nnv07Tu+N6qiDg9Ir4SEX9V9/mDiHhJvezPgecD76/n9/4207oI+PWImNvUtoLq7+PngJ8HyMyLM3NbZo5m5ucz85sdnua5wNWZ+dbM/HG97d2Z+TeZeUnTc5io3psi4m0R8c16b9qlEfG4puWrIuLOuo6/3VLj/epa/Cgi7oqID0TEUL1seUT8OCL+NCL+E/hwh+cgaQoZtqQ+kZmvA34EvDwz98/MdwNXAcvrVX4ZuK3+CfAC4MuZmRHxLOBDwO8BTwQ+CFxe/8M9B/g0cBPVXp1fAd4cESsy89+BvwAurcc8epJpfhSYCxwFHAz8NUB9GO984NXAk4AfApd06KOd5wIbgQOBdwP/GBGRme8Avgy8sZ7fG1s3zMyvAncCJzU1vw7458x8FPgesK0Ori+JiPmTzOVXgU9OtMJE9W5a7dXAicDhwDOB0+ttTwTeBrwIWFyP1+wvqQLiMcDTqF6z/9m0/Oeo9tA9BXjDJM9F0hQwbEn97Srg+XVgegFVEFlWL/vlejnAGcAHM3N9vffmIqrDZMcBvwQclJn/KzMfyczbgAuB1+7KRCLiScBLgN/PzHszcywzG+OfCnwoM7+RmQ8DZwPHR8SiLrv/YWZemJnbqPZUPQk4ZBem9xHqQ4n1YcNX1v2QmfdRHf5Lqud9T73nrVP/BwL/2XgQEW+s9149EBEX1s0T1bvhfZl5R2ZuoQq7x9TtrwY+nJk3Z+aDVHvSGmNF3fdbMnNLZt5PFYabX6vtwDsz8+HMHN2FGknaTYYtqY9l5q3AA1T/UD+f6tyjOyLiSHYMW08BzqxDwdaI2AocCiysly1sWfZ2di3MUPe3JTPvbbNsIdXerMa8HwB+SrVXphvj4SYzH6rv7r8Lc/sIcEJEjAAnA7dk5g1NfX4nM0/PzCcDv1jP92869PVTqrDX2Pb9mTlcrz9YN09U752eE/BQ0/NZCNzetOyHTfcPotpzeH1Tv/9etzfcU5+rJmmaeHKk1F+yTdtVVAFi38zcHBFXUe3FmQ/cWK9zO/DnmfnnrRtHxPHADzJz8S6M2c7twIKIGM7MrS3L7qAKII0x51EdXtsMPFg3zwXuq+//XJdjdjW/zPxRRHyZag/bS6jCV6d1vxsR/0R1CLCdL1AdkpzofKiO9e7CnVTBrOGwpvs/AUaBozJzc4ftu329JE0R92xJ/eUu4KktbVcBbwS+VD9eB/wx8JX6sBtUh8d+PyKeG5V5EfGyiHg88DXgvvqk6qGIGIiIX4zHPvrgLmBRfaiyo8y8k+qE87+PiPkRMRgRL6gX/zPwWxFxTH3e0l8A6zNzU2beQxW6TqvH/m3giD2sSTsXUdVpGfDxRmNEPD0izoz6Yxsi4lCqKxWv7dDPuVSHbt9b7ykjIg4EfqFpnYnqPZl/AU6PiGfUJ/W/s7EgM7fXff91RBxcjz0SESu66FdSIYYtqb+cD5xTH0J6W912FfB4HgtbX6HaS9R4TGZeR3Wuz/uBe4FbqE/IrgPZy6kORf6Aau/J/wUOqDf/RP3zpxHxjUnm9zpgDPgucDfw5nqMLwB/RnVi+Z1UYar5PKMzgFVUh+iOAr46WSGa/G/g5PpKxfdNsN5lVHv7vlAHw4b7qU7AXx8RD1KFrJuBM9t1kpnfozr36snATRFxP3A11d67P6vX6VjvyWTm56gOSV5Zb3dlyyp/WrdfGxH3Af8BHNlN35LKiEz3KEuSJJXini1JkqSCPEFe0pSKiAc6LHpJZn55WicjSTOAhxElSZIK8jCiJElSQTPqMOKBBx6YixYtKjrGgw8+yLx584qOofasfW9Y996x9r1h3XtnttX++uuv/0lmHjTZejMqbC1atIjrrruu6Bjr1q1j+fLlRcdQe9a+N6x771j73rDuvTPbah8RP5x8LQ8jSpIkFWXYkiRJKsiwJUmSVJBhS5IkqSDDliRJUkGGLUmSpIIMW5IkSQUZtiRJkgqaUR9qWtKisz4DwJlLHuX0+r7KGIhgW+b4z5HhIVatOJJhYM0Nm3nXp7/FvQ+Ntd3uqQfN5bZ7HmLbJN/ZGcBEawwPDXLUwsdz7W33si2TCBjaZw6jY9tZWM9n5bEjO2yz5obNnP2pbzI6th2AOQHHP3UBm346yh1bR9lnDtSLAFh88Dzuuf8Rto5Wz2X+3EHe+fKjduh3zQ2bWb12I3dsHWV47iCZ8LPRMRYOD3HC0w/ii9+9h81bR3eqVevcWvtqPAdgh7ZFTxwaf84NZx+zna03bB7vs10/K48d4UXvXcf3735wp3EXHzyPK966fMJ5tJvvOWs2cPH628ffC6c891CAHdrm7TuH+x7e1nGsybQb47yVS7reXrNbt+/lfrCozb97my54WQ9mMjvNqC+iXrp0aZb4BPnmN9mZSx7lPRtmTcacMYYGB/izZyfv/Hoytq3377mhwQHOP2nJDgHkrZfeyPZJtpvM4ECw+uSjWXnsSB3eNjA6tm3yDSeYW2N+rX0NzgkIJq3nmUse5e+/ux/nn1SFkNZ+hgYHeMLjBrjr/kc69tEIQe3m0W6+56zZwMeu/VH3T7rNWJPpNMZpxx02YwLXbPs07Zmim7p3+17uB+2CVsNUB67Z9p6PiOszc+lk63kYUdNidGwbWx4cmxFBC6r5rF67cfzx6rUb9zhoQRV8Gv2uXrtxl4NWu7l16mtse/fBtdFnu35Gx7ZNGLSA8T1enbZvne/F62/val4TjTWZTmPsydiaPbp9L0tTwV08mjY54YG/6XfH1tG296eq3z3ps3XbqZhfyT5a2yc7DDwVOo0xHWNr79fte1maCu7Z0rQJotdT2MHC4aG296eq3z3ps3XbqZjfwuGhPe6n0/at7QNR/rXuNMZ0jK29X7fvZWkqGLY0LYYGB1gwb5DBgZnxD+HQ4MD4CeYAq1YcOSW/DIMDMd7vqhVHMjQ4sMdz69TX4Jzoup6NPtv1MzQ4wCGP33fC7RcfPK/jPNrNt3Ey/O5ojDWZTmPsydiaPbp9L0tTYVaELa+4mF6NPQuNnyPDQ5x/0hIWDg+x+uSjmT93sON2iw+e19WeicnWGB4aZNkRC8b7ioC5g3OIpvk0nwS78tgR3vuaYxgafOxXYk7AsiMWMDI8RACDLb8tiw+ex/DQY89l/tzB8ZPjG32ef9KS8e3nzx1keGhwfA6nHXcYI/X/oltr1XqCbmtfI8NDrH7V0aw++egd2pqfc8O+A3PG+2zXz/knLWH9O17UMeQ0n7DeafvW+Z63cgmnHXfYDu+F0447bKe2J+y34z92u3I1YqcxZsrJ8ZrZun0v94NO/wb6b+P0mRVXIzabbVdKzCTWvjese+9Y+96w7r0z22rv1YiSJEkzgGFLkiSpIMOWJElSQYYtSZKkggxbkiRJBRm2JEmSCjJsSZIkFWTYkiRJKsiwJUmSVJBhS5IkqSDDliRJUkGGLUmSpIIMW5IkSQUZtiRJkgoybEmSJBVk2JIkSSrIsCVJklSQYUuSJKkgw5YkSVJBhi1JkqSCDFuSJEkFGbYkSZIKMmxJkiQVZNiSJEkqyLAlSZJUkGFLkiSpIMOWJElSQYYtSZKkggxbkiRJBRm2JEmSCjJsSZIkFWTYkiRJKsiwJUmSVJBhS5IkqSDDliRJUkGGLUmSpIIMW5IkSQUZtiRJkgoybEmSJBVk2JIkSSrIsCVJklSQYUuSJKkgw5YkSVJBhi1JkqSCDFuSJEkFGbYkSZIKMmxJkiQVZNiSJEkqyLAlSZJUkGFLkiSpIMOWJElSQYYtSZKkggxbkiRJBRm2JEmSCjJsSZIkFWTYkiRJKsiwJUmSVJBhS5IkqSDDliRJUkGGLUmSpIIMW5IkSQUZtiRJkgoybEmSJBVk2JIkSSrIsCVJklSQYUuSJKkgw5YkSVJBhi1JkqSCDFuSJEkFGbYkSZIKMmxJkiQVZNiSJEkqyLAlSZJUkGFLkiSpIMOWJElSQYYtSZKkggxbkiRJBRm2JEmSCjJsSZIkFWTYkiRJKsiwJUmSVJBhS5IkqSDDliRJUkGGLUmSpIIMW5IkSQUZtiRJkgoybEmSJBVk2JIkSSrIsCVJklSQYUuSJKkgw5YkSVJBhi1JkqSCDFuSJEkFGbYkSZIKMmxJkiQVZNiSJEkqyLAlSZJUkGFLkiSpIMOWJElSQYYtSZKkggxbkiRJBRm2JEmSCpo0bEXEQET8x3RMRpIkqd9MGrYycxvwUEQcMA3zkSRJ6iv7dLnefwEbIuIK4MFGY2a+qcisJEmS+kS3Yesz9U2SJEm7oKuwlZkXRcQQcFhmbiw8J0mSpL7R1dWIEfFy4Ebg3+vHx0TE5SUnJkmS1A+6/eiHc4HnAFsBMvNG4PBCc5IkSeob3YatRzPzZy1tOdWTkSRJ6jfdniB/c0T8BjAQEYuBNwFfLTctSZKk/tDtnq0/Bo4CHgYuBu4D3lxqUpIkSf2i26sRHwLeUd8kSZLUpQnDVkR8mgnOzcrMV0z5jCRJkvrIZHu2/qr+eRLwc8DH6senAJsm2jAiPgT8GnB3Zv7iHsxxSiw6q/pM1jOXPMrpZ82uz2edt+8ADz6ybY/7GR4aZPv27dz38M59zR2cw36DA2x9aIyFw0OsWnEkK48dGV9+zpoNHPLgz3ao/bIjFvDxM44HYM0Nm/mTy27ikW2dr7toXr/Zmhs2s3rtRjZvHd2hfd+B4N0nH73DPNppbH/H1tG2c58OuzOHXsx7sjFPvfAarr51y/jj1tdsJtR6qvXqdXjXp7/FvQ+NAdXv5rmvOGqvr6W0p2bq35gJz9nKzKsy8yrg2Mx8TWZ+ur79BvC8Sfr+J+DEKZrnHlk0y8JVq6kIWgBbR8faBi2Ah8a2c+9DYySweesoZ39qA2tu2AxUQetj1/5op22uvnULp154DWtu2MxbLr1xwqDVvH6zNTds5uxPbdgpaAE8si15y6U3js+jnebt2819OuzOHHox78nGbA1asONrNhNqPdV69Tqsuuym8aAF1e/mqk/ctFfXUtpTM/lvTLcnyB8UEU9tPIiIw4GDJtogM78EbJloHfWv0bFtrF5bfdnAxetv77je1bduYfXajV1/jkjrP+ar125kdKxzmMx6nU7abd889+mwO3PoxbwnG7P1tWlotM+EWk+1Xr0OY23+YzK2PffqWkp7aib/jYnMyf+Zi4gTgX8AbqubFgG/l5lrJ9luEfBvEx1GjIg3AG8AOOSQQ559ySWXdDPvXbJh82MfEXbIENy1804QFbJk5IDx+k9V7ZeMHDB+v/m17XabZhNt32mbqbY7c9iVbR544AH233//3ZvcLoy5p8v3RpM9p6mqfbdjNsad7UrUXd3pZe178TfmhBNOuD4zl062XldhCyAi9gOeXj/8bmY+3MU2i5gkbDVbunRpXnfddV3NZ1c0H0Y8c8mjvGdDtx8vpj0xMjzE1We9kCPO/izbMjvWfmR4qO1hwE42XfCy8fvLLrhy0m0b82in0/YTbTPVdmcOu7LNunXrWL58efF5TnS4ftMFL5sRtZ5qkz2nqap9N2M2jzvblai7utPL2vfib0xEdBW2uj2MCPBsqs/aOhp4TUS8fncnp/43NDjAqhVHAnDKcw/tuN6yIxawasWRRJf9LjtiwQ6PV604kqHBgY7rR71OJ+22b577dNidOfRi3pON2fraNDTaZ0Ktp1qvXofBgZ1/YwbnxF5dS2lPzeS/MV3t4omIjwJHUH0ZdeOAaAIfKTSvKbXpgpfN6pPke3014nkrl1QrPXjbDtu0Xqm2O1cjNsbY3asRm7fv1dUruzOHXsx7sjE/fsbxE16NOBNqPdV6+Tp4NaK0o5n8N6bbc7a+Azwjuz3mWG1zMbAcOBC4C3hnZv7jRNuUOozYzN3LvWPte8O694617w3r3juzrfbdHkbs+rsRqT5n685uJ5CZp3S7riRJUr/qNmwdCHw7Ir5G9f2IgJ8gL0mSNJluw9a5JSchSZLUr7r9IuqrIuIpwOLM/I+ImAt0vgRMkiRJQJcf/RARZwCXAR+sm0aANaUmJUmS1C+6/ZytPwKWAfcBZOb3gYNLTUqSJKlfdBu2Hs7MRxoPImIf6Prr7CRJkmatbsPWVRHxdmAoIl4EfAL4dLlpSZIk9Yduw9ZZwD3ABqovjf5MZr6j2KwkSZL6xIRhKyJeGRF/lJnbM/NC4CnAUuDtEXHytMxQkiRpLzbZnq0/AS5verwv1RdSLwf+oNCcJEmS+sZkn7O1b2be3vT4K5m5BdgSEfMKzkuSJKkvTLZna37zg8x8Y9PDg6Z+OpIkSf1lsrC1vv5A0x1ExO8BXyszJUmSpP4x2WHEtwBrIuI3gG/Ubc8G9gNWlpyYJElSP5gwbGXm3cB/i4gXAkfVzZ/JzCuLz0ySJKkPdPtF1FcCBixJkqRd1O2HmkqSJGk3GLYkSZIKMmxJkiQVZNiSJEkqyLAlSZJUkGFLkiSpIMOWJElSQYYtSZKkggxbkiRJBRm2JEmSCjJsSZIkFWTYkiRJKsiwJUmSVJBhS5IkqSDDliRJUkGGLUmSpIIMW5IkSQUZtiRJkgoybEmSJBVk2JIkSSrIsCVJklSQYUuSJKkgw5YkSVJBhi1JkqSCDFuSJEkFGbYkSZIKMmxJkiQVZNiSJEkqyLAlSZJUkGFLkiSpIMOWJElSQYYtSZKkggxbkiRJBRm2JEmSCjJsSZIkFWTYkiRJKsiwJUmSVJBhS5IkqSDDliRJUkGGLUmSpIIMW5IkSQUZtiRJkgoybEmSJBVk2JIkSSrIsCVJklSQYUuSJKkgw5YkSVJBhi1JkqSCDFuSJEkFGbYkSZIKMmxJkiQVZNiSJEkqyLAlSZJUkGFLkiSpIMOWJElSQYYtSZKkggxbkiRJBRm2JEmSCjJsSZIkFWTYkiRJKsiwJUmSVJBhS5IkqSDDliRJUkGGLUmSpIIMW5IkSQUZtiRJkgoybEmSJBVk2JIkSSrIsCVJklSQYUuSJKkgw5YkSVJBhi1JkqSCDFuSJEkFGbYkSZIKMmxJkiQVZNiSJEkqyLAlSZJUkGFLkiSpIMOWJElSQYYtSZKkggxbkiRJBRm2JEmSCjJsSZIkFWTYkiRJKsiwJUmSVJBhS5IkqSDDliRJUkGGLUmSpIIMW5IkSQUZtiRJkgoybEmSJBVk2JIkSSrIsCVJklSQYUuSJKkgw5YkSVJBhi1JkqSCDFuSJEkFFQ1bEXFiRGyMiFsi4qySY0mSJM1E+5TqOCIGgL8DXgT8GPh6RFyemd8uNeZEFp31GQDOXPIop9f393YDEWzL3ON+5gDbm/pr/BwZHmLViiNZeezI+LprbtjM6rUbuWPrKAcMDRIB9z40xpyA7fVUhocGOfcVR+2wXavmfobnDpIJW0fHdlhn2REL+PgZx48/ftF71/H9ux8cf7zPnGDb9mRhyzyb+25d1i/OWbOBi9ffPv56nfLcQzlv5ZJpncOpF17D1bduGX/c+npJkiol92w9B7glM2/LzEeAS4BXFhyvo0V9Eq5aTUXQgipoNffX+Ll56yhnf2oDa27YDFQh5uxPbWDz1lGSKhzd+1AVkLY3TWXr6BirPnHT+HatWvu596GxnYIWwNW3buHUC68Bdg5aAI9uT7Jlnq19tz6HfnDOmg187Nof7fB6fezaH3HOmg3TNofWoAU7vl6SpMeUDFsjwO1Nj39ct2kvMjq2jdVrNwKweu1GRse2dbXd2PYc367VrvTT+Ae9NWh1mme7vpufQz+4eP3tu9ReQmvQmqxdkmazyCnaO7JTxxGvAlZk5u/Wj18HPCcz/7hlvTcAbwA45JBDnn3JJZdM+Vw2bP7Z+P1DhuCu0Skfou8tGTlghzruynYNDzzwAPvvv/8u97O7Y080l73ZRLVo9xwbde/lHGarErXX5Kx778y22p9wwgnXZ+bSydYrGbaOB87NzBX147MBMvP8TtssXbo0r7vuuimfS/NhxDOXPMp7NhQ7Va0vjQwPcfVZL2TZBVeyeWv3SbWxXcO6detYvnz5Lvez6YKXdXUoeGR4CKBt361z2ZsdcfZn2x5CHojg1vNfulN7o+5TaaLXY9MFL5vSsfZmJWqvyVn33plttY+IrsJWycOIXwcWR8ThEbEv8Frg8oLjqYChwQFWrTgSgFUrjmRocKCr7QbnxPh2rXaln2VHLABg8cHzuppnu76bn0M/OOW5h+5SewmN16XbdkmazYqFrcx8FHgjsBb4DvAvmfmtUuNNpF//pz0QMSX9NN4Ejf4aP0eGhzj/pCXjV/KtPHaE809awsjwEEF11eH8uYNVH01TGR4aZPWrju54BWBrP/PnDjI8NLjTes1Xt13x1uU7Ba595gTRMs/WvlufQz84b+USTjvusB1er9OOO2xar0b8+BnH7xSsvBpRktorejwtMz8LfLbkGN1qBK5169ax6dTlvZ3MXqwRaHrRzxVvXV6s773NeSuXTPtHPbQyWElSd/wEeUmSpIIMW5IkSQUZtiRJkgoybEmSJBVk2JIkSSrIsCVJklSQYUuSJKkgw5YkSVJBxb4bcXdExD3ADwsPcyDwk8JjqD1r3xvWvXesfW9Y996ZbbV/SmYeNNlKMypsTYeIuK6bL43U1LP2vWHde8fa94Z17x1r356HESVJkgoybEmSJBU0G8PWP/R6ArOYte8N69471r43rHvvWPs2Zt05W5IkSdNpNu7ZkiRJmjaGLUmSpIJmTdiKiBMjYmNE3BIRZ/V6PnubiNgUERsi4saIuK5uWxARV0TE9+uf8+v2iIj31bX+ZkQ8q6mf36zX/35E/GZT+7Pr/m+pt42JxuhXEfGhiLg7Im5uautZnScao990qP25EbG5ft/fGBEvbVp2dl2XjRGxoqm97d+aiDg8ItbXNb40Ivat2/erH99SL1802Rj9JCIOjYgvRsR3IuJbEfE/6nbf94VNUHvf91MtM/v+BgwAtwJPBfYFbgKe0et57U03YBNwYEvbu4Gz6vtnAX9Z338p8DkggOOA9XX7AuC2+uf8+v78etnXgOPrbT4HvGSiMfr1BrwAeBZw80yoc6cx+vHWofbnAm9rs+4z6r8j+wGH139fBib6WwP8C/Da+v4HgD+o7/8h8IH6/muBSycao9d1KlD3JwHPqu8/Hvhe/dx93/eu9r7vp/g2W/ZsPQe4JTNvy8xHgEuAV/Z4Tv3glcBF9f2LgJVN7R/JyrXAcEQ8CVgBXJGZWzLzXuAK4MR62RMy85qsfts+0tJXuzH6UmZ+CdjS0tzLOncao+90qH0nrwQuycyHM/MHwC1Uf2fa/q2p96S8ELis3r61xo3aXwb8Sr1+pzH6SmbemZnfqO/fD3wHGMH3fXET1L4T3/e7abaErRHg9qbHP2biN5R2lsDnI+L6iHhD3XZIZt4J1S8tcHDd3qneE7X/uE37RGPMJr2ss7878Mb6UNKH4rHD2Lta+ycCWzPz0Zb2Hfqql/+sXn/W1b4+lHQssB7f99Oqpfbg+35KzZawFW3a/MyLXbMsM58FvAT4o4h4wQTrdqr3rrZrYtNR59n+2vwf4AjgGOBO4D11+1TW3t8LICL2Bz4JvDkz75to1TZtvu/3QJva+76fYrMlbP0YOLTp8ZOBO3o0l71SZt5R/7wb+Feq3bp3NXat1z/vrlfvVO+J2p/cpp0JxphNelnnWf27k5l3Zea2zNwOXMhjhzN2tfY/oToUtU9L+w591csPoDqcOWtqHxGDVP/YfzwzP1U3+76fBu1q7/t+6s2WsPV1YHF9VcS+VCfjXd7jOe01ImJeRDy+cR94MXAzVQ0bV/z8JvD/6vuXA6+vr+g5DvhZvYt+LfDiiJhf75Z+MbC2XnZ/RBxXH7N/fUtf7caYTXpZ505jzAot5+n8d6r3PVR1eW19RdXhwGKqk7Db/q2pzxX6InByvX1rjRu1Pxm4sl6/0xh9pX4v/iPwncx8b9Mi3/eFdaq97/sCpvJs+5l8o7q65HtUVza8o9fz2ZtuVFeY3FTfvtWoH9Xx9S8A369/LqjbA/i7utYbgKVNff021QmPtwC/1dS+lOoX+lbg/Tz27QZtx+jXG3Ax1W77Mar/4f1OL+s80Rj9dutQ+4/Wz/ubVP8IPKlp/XfUddlIfXVb3d72b039e/S1+jX5BLBf3f64+vEt9fKnTjZGP92A51EdJvomcGN9e6nv+57W3vf9FN/8uh5JkqSCZsthREmSpJ4wbEmSJBVk2JIkSSrIsCVJklSQYUvStIiI4Yj4w4L9vzki5pbqv8s5nB4RC3s5B0kzj2FL0nQZpvry2VLeDPQ0bAGnA4YtSTswbEmaLhcAR0TEjRHx4Yh4BUBE/GtEfKi+/zsRcV59/7SI+Fq9/ge9vKG3AAACkUlEQVQjYqBuf3FEXBMR34iIT0TE/hHxJqqQ88WI+GKnCUTEifV2N0XEF+q2BRGxpv4euGsj4pl1+7kR8bambW+OiEX17TsRcWFEfCsiPh8RQxFxMtXnOX28nvNQkSpK2usYtiRNl7OAWzPzGKpP+35+3T4CPKO+/zzgyxHxC8BrqL6T8xhgG3BqRBwInAP8albf1Xkd8NbMfB/VV3qckJkntBs8Ig6i+uqRX8/Mo4FX1YveBdyQmc8E3g58pIvnshj4u8w8Ctha93lZPZ9TM/OYzBztriyS+t0+k68iSVPuy8CbI+IZwLeB+fVXhBwPvInqazyeDXy9+kYRhqi+t+44qmB2dd2+L3BNl2MeB3wpM38AkJlb6vbnAb9et10ZEU+MiAMm6esHmXljff96YFGXc5A0Cxm2JE27zNxcf3/dicCXgAXAq4EHMvP++jvbLsrMs5u3i4iXA1dk5im7MWxQfTVJu/adpgg8yo57/x/XdP/hpvvbqMKgJLXlYURJ0+V+4PFNj6+hOqn9S1R7ut5W/4Tqe+pOjoiDYfy8qqcA1wLLIuJpdfvciPj5Dv23ugb45frLbYmIBXX7l4BT67blwE8y8z5gE/Csuv1ZwOG78RwlybAlaXpk5k+pDv/dHBGrqYLVPpl5C/ANqr1bX67X/TbVuVmfj4hvAldQfRnuPVRX/F1ct18LPL0e4h+Az3U6Qb7e9g3ApyLiJuDSetG5wNK6vwuoDmECfBJYEBE3An9A9SW7k/kn4AOeIC+pmV9ELUmSVJB7tiRJkgryBHlJfSci1gP7tTS/LjM39GI+kmY3DyNKkiQV5GFESZKkggxbkiRJBRm2JEmSCjJsSZIkFWTYkiRJKsiwJUmSVND/B/3vaTa0ASSSAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 720x360 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#tweet_count发布tweet数量\n",
    "drawScatter(df['tweet_count'],y,'tweet_count')\n",
    "#tweet_count特征，可以重点参考"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlsAAAFOCAYAAACxNtjEAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAHdFJREFUeJzt3Xm4XXV97/H3p8FQBGQ2VwYJDq3FoSipE4rBEUWcHosodaitQ21LsfhQFG9BW65Dq+3V9l5nC1ZBRfGiWBWHMCnBgEBQqzJJQAQxMqpQ8Hv/WOuw9zlNztkZfjknZ79fz7Ofvddvr73Wb32Tc/LJ77f2WqkqJEmS1MZvzXYHJEmS5jPDliRJUkOGLUmSpIYMW5IkSQ0ZtiRJkhoybEmSJDVk2JIkSWrIsCVpZEmWJfnTTbzPw5J8peH275/ktiQL1vL+cUn+vdX+Z1OSxUkqyRaz3RdpPjNsSXNIkquSPHUT73POhIk1/eNfVR+vqqe32mdVXV1V21TV3evyuSS7JbkryQPX8N6pSf6xf/3cJBcluSXJjUm+lmTxNNtdkuQLSX6R5KYk30tyfJId1vXYJM0Nhi1JWg9VdS3wNeClw+1JdgSeBZyQ5EHAicCRwHbAXsD/AX6zpm0meTywDDgXeEhVbQ8cCNwF/H6TA1lPjoZJozNsSXNEko8B9wc+309rHZXkhCRH9u/v1o/6vK5fflCS1UnSLz+7H0G5Kck3kzxiaNu7JvlMkp8luTLJ4X37gcCbgBf1+7x4Hfr7W0nenOTHSW5IcmKS7Ybef0Lfj5uSrEryir79oCTf6Ud6ViU5bmizZ/XPN/X9eVySVyQ5Z2i7j0/y7SQ398+PH3pvWZK/S3JukluTfCXJzjMcx6TRtCR7JTmz//wZwHSfP4EpYQs4FPhuVa0E9gGurKqvVefWqvpMVV29lu29E/hoVb2tqq6He0bejq2qZUN9fmWS7/ejX19OsufQe5XktUl+1L//r0N/RxYk+cd+hO0K4KAptdguyYeTXJfk2iR/PzG92v85nJvkn5KsBo6brq6SBgxb0hxRVS8FrgYO7qe13gmcCSztV3kScEX/DLA/cHZVVZJHAR8BXgPsBLwfOC3Jlkl+C/g8cDGwG/AU4Igkz6iqLwH/C/hkv891GT15Rf84AHgAsA3wL9CdBwX8B/BeYBe60HFR/7nbgZcB29P9Y/9nSZ43dEwA2/f9+dbwDvtRo9OB9/TH+W7g9CQ7Da32EuCPgfsCC4E3rMMxAXwCuIAuZP0d8PJp1j0V2DnJE4baXko3mgVwIfCQPqAckGSbtW0oydbA44DPTNe5vlZvAl5AV9uzgZOmrPZs4A/oRsMOAZ7Rt7+qf++RwBLghVM+dwLdKNqD+nWeDgyfo/cYur+D9wWOn66fkgYMW9LcdibwxD4w7U838rFf/96T+veh+0f0/VW1vKrurqoTgDuAx9L9o7tLVb21qu6sqiuAD9KNwGyIw4B3V9UVVXUb8Ebg0H6E6DDgq1V1UlX9V1X9vKouAqiqZVW1sqp+U1WX0AWFJ611L5MdBPyoqj5WVXdV1UnAfwIHD63z0ar6YVX9CvgUXdAbSR8S/wD4n1V1R1WdRRdU16jfx6fpwiNJHgzsSxfY6Gu9lC7kfgq4Mcm/rSV07UD3O/mnQ/15Zz8yeHuSN/fNrwHeVlXfr6q76MLyPsOjW8Dbq+qmfgTtG0M1OAT456paVVWrgbcN7WsR8EzgiKq6vapuAP6JyX9PflJV7+1r/6u11UXSZIYtaQ6rqsuB2+j+sXwi8AXgJ0l+l8lha0/gyP4f5puS3ATsAezav7frlPfeBCzawO7tCvx4aPnHwBb9dvcALl/Th5I8Jsk3+inNm4HXMv1U3XT7nNjvbkPLPx16/Uu6EbdR7Qr8oqpun7L96ZwAHJLkt+lGtb7UBxUAquq8qjqkqnah+zPcHzhmDdv5Bd25XPcb+uxR/Xlbp9LVFro/z/899Ge5Ggij1WBXYNVajm1P4F7AdUPbfj/dKNaE4c9KGpFhS5pbag1tZ9JN9yzsT8o+k24kZQcGU3OrgOOravuhx737kZ9VdOcNDb+3bVU9a5p9juIndP9AT7g/3RTU9f0+/9u39HqfAE4D9qiq7YD30YWFUfoydZ8T+7129G5P6zpgh35Kb3j7a1VVZwM/B54L/BGDKcQ1rftt4LPAw9bw3u3AcrrpwemsAl4z5c9zq6r65gyfg+749hhaHj62VXSjoTsPbfc+VfXQ4W6OsA9JUxi2pLnlerrzn4adCfwFg5PHlwF/CZwzdLmCDwKv7UeNkmTr/kT0bYHzgVuS/E2SrfqTpB+W5A+G9rm4n6pcFycBr+9PKN+GwblfdwEfB56a5JAkWyTZKcnEVNa2wOqq+nWSR9OdYzXhZ3SjO1NrMOGLwO8keUm/3RcBe9ON+G2wqvoxsAJ4S5KF/blYB8/wMegC1jvozkO7Z9ox3ZcEXpXkvv3yQ4DnAOetZTtHAa9McvTQZ3an+xbjhPcBb0zy0P797ZL84YiH+Cng8CS7p7uUxNETb1TVdcBXgHcluU+6L0A8MMmoU7yS1sKwJc0tbwPe3E/jTJzYfSZdQJkIW+cA9x5apqpW0J239S9001GX0Z28Th/IDqb/ZhxwI/AhuksRQHfOEcDPk1y4Dn39CPCxvh9XAr+mC4H05wo9i+6SB6vpRuAmTr5/HfDWJLcCf0sXACaO45d0J16f29fgscM7rKqf053gfSTdaNJRwLOr6sZ16PdMXkJ3Ivhq4FimGakaciLdKNEnq+qOofab6MLVyiS3AV+imxJ855o2UlXnAE+mm2r8YT+V9yW6gP3efp1T6YLdyUluAS6lO9dqFB8Evkz3ZYkL6UbZhr2M7ksF36P7e3QKQ9OaktZPqhwVliRJasWRLUmSpIYMW5ImSXcx0TU9njjbfVtf6e6vuKZj+u5s903S/Oc0oiRJUkOObEmSJDU0p24kuvPOO9fixYub7uP2229n6623nnnFMWAtJrMeA9ZiwFoMWIvJrMfAuNbiggsuuLG/YPG05lTYWrx4MStWrGi6j2XLlrF06dKm+9hcWIvJrMeAtRiwFgPWYjLrMTCutUgy0x0mAKcRJUmSmjJsSZIkNWTYkiRJasiwJUmS1JBhS5IkqSHDliRJUkOGLUmSpIYMW5IkSQ0ZtiRJkhqaUzeiXrJkSbW6gvybP7eSTyy/mtc/7C7etXJOXTh/1hz5cGsxzHoMWIsBazFgLSazHgNztRb7PXBHPv6qxzXbfpILqmrJTOuNxcjWmz+3kn8/72p+M3dypSRJauzcy1dz2Ae/NdvdGI+wddLyVbPdBUmSNAvOvXz1bHdhPMLW3XNoqlSSJI2XsQhbC5LZ7oIkSRpTYxG2XvyYPWa7C5IkaRbs98AdZ7sLzL2vDjTw9897OACfWH71LPdEkiRtKq2/jTiqsbn0w4Rly5axdOnSpvvYXFiLyazHgLUYsBYD1mIy6zEwrrXw0g+SJElzgGFLkiSpIcOWJElSQ4YtSZKkhgxbkiRJDRm2JEmSGjJsSZIkNWTYkiRJasiwJUmS1JBhS5IkqSHDliRJUkOGLUmSpIYMW5IkSQ0ZtiRJkhoybEmSJDVk2JIkSWrIsCVJktSQYUuSJKkhw5YkSVJDhi1JkqSGDFuSJEkNGbYkSZIaMmxJkiQ1ZNiSJElqyLAlSZLUkGFLkiSpIcOWJElSQ4YtSZKkhgxbkiRJDRm2JEmSGjJsSZIkNWTYkiRJasiwJUmS1JBhS5IkqSHDliRJUkOGLUmSpIYMW5IkSQ0ZtiRJkhoybEmSJDVk2JIkSWrIsCVJktSQYUuSJKkhw5YkSVJDhi1JkqSGDFuSJEkNGbYkSZIaMmxJkiQ1ZNiSJElqyLAlSZLUkGFLkiSpIcOWJElSQ4YtSZKkhgxbkiRJDRm2JEmSGjJsSZIkNWTYkiRJasiwJUmS1JBhS5IkqSHDliRJUkOGLUmSpIYMW5IkSQ0ZtiRJkhoybEmSJDVk2JIkSWrIsCVJktSQYUuSJKkhw5YkSVJDhi1JkqSGDFuSJEkNGbYkSZIaMmxJkiQ1ZNiSJElqyLAlSZLUkGFLkiSpIcOWJElSQ4YtSZKkhgxbkiRJDRm2JEmSGjJsSZIkNWTYkiRJasiwJUmS1JBhS5IkqSHDliRJUkOGLUmSpIYMW5IkSQ0ZtiRJkhoybEmSJDVk2JIkSWrIsCVJktSQYUuSJKkhw5YkSVJDhi1JkqSGDFuSJEkNGbYkSZIaMmxJkiQ1ZNiSJElqyLAlSZLUkGFLkiSpIcOWJElSQ4YtSZKkhgxbkiRJDc0YtpIsSPLVTdEZSZKk+WbGsFVVdwO/TLLdJuiPJEnSvLLFiOv9GliZ5Azg9onGqjq8Sa8kSZLmiVHD1un9Q5IkSetgpLBVVSck2Qq4f1X9oHGfJEmS5o2Rvo2Y5GDgIuBL/fI+SU5r2TFJkqT5YNRLPxwHPBq4CaCqLgL2atQnSZKkeWPUsHVXVd08pa02dmckSZLmm1FPkL80yUuABUkeDBwOfLNdtyRJkuaHUUe2/hJ4KHAHcBJwC3BEq05JkiTNF6N+G/GXwDH9Q5IkSSOaNmwl+TzTnJtVVc/Z6D2SJEmaR2Ya2frH/vkFwP8A/r1ffjFw1XQfTPIR4NnADVX1sA3ooyRJ0mYrVTN/qTDJWVW1/0xtU97fH7gNOHHUsLVkyZJasWLFKKuus0cc+yVuueNujnz4Xbxr5ajfC5jfrMVk1mPAWgxYiwFrMZn1GJirtVi07UKWH/O0ZttPckFVLZlpvVFPkN8lyQOGNr4XsMt0H6iqs4DVI26/qYmgJUmSxsf1t97JY44/Y7a7MfKlH14PLEtyRb+8GHhNkx41YNCSJGk8XX/rnbPdhdGmEQGSbAk8pF/8z6q6Y4TPLAa+MN00YpJXA68GWLRo0b4nn3zySP1ZFyuvHVyPddFWcP2vNvouNkvWYjLrMWAtBqzFgLWYzHoMzPVaPHy37Zps94ADDhhpGnFdwtbj6Ua07hkNq6oTZ/jMYmYIW8NanbO1+OjT73k9V+eVZ4O1mMx6DFiLAWsxYC0msx4Dc70WV739oCbbHfWcrZEqk+RjwAPpbkY9MSdXwLRha664z5YLnEqUJGkMLdp24Wx3YeRztpYAe9eow2BAkpOApcDOSa4Bjq2qD697FzfcJW850JPkJUkaM62/jTiqke+NSHedretG3XBVvXi9etTIJW85EIBly5Zx1WFLZ7czc4S1mMx6DFiLAWsxYC0msx4D1mJ6o4atnYHvJTmf7v6IgFeQlyRJmsmoYeu4lp2QJEmar0a9EfWZSfYEHlxVX01yb2BB265JkiRt/ka6gnySVwGnAO/vm3YDPteqU5IkSfPFqLfr+XNgP+AWgKr6EXDfVp2SJEmaL0YNW3dU1T3Xu0+yBd11tiRJkjSNUcPWmUneBGyV5GnAp4HPt+uWJEnS/DBq2Doa+Bmwku4+hqdX1THNeiVJkjRPTBu2kjw3yZ9X1W+q6oPAnnRXk39Tkhdukh5KkiRtxmYa2ToKOG1oeSGwL91teP6sUZ8kSZLmjZmus7WwqlYNLZ9TVauB1Um2btgvSZKkeWGmka0dhheq6i+GFnfZ+N2RJEmaX2YKW8v7C5pOkuQ1wPltuiRJkjR/zDSN+Hrgc0leAlzYt+0LbAk8r2XHJEmS5oNpw1ZV3QA8PsmTgYf2zadX1deb90ySJGkeGPVG1F8HDFiSJEnraNSLmkqSJGk9GLYkSZIaMmxJkiQ1ZNiSJElqyLAlSZLUkGFLkiSpIcOWJElSQ4YtSZKkhgxbkiRJDRm2JEmSGjJsSZIkNWTYkiRJasiwJUmS1JBhS5IkqSHDliRJUkOGLUmSpIYMW5IkSQ0ZtiRJkhoybEmSJDVk2JIkSWrIsCVJktSQYUuSJKkhw5YkSVJDhi1JkqSGDFuSJEkNGbYkSZIaMmxJkiQ1ZNiSJElqyLAlSZLUkGFLkiSpIcOWJElSQ4YtSZKkhgxbkiRJDRm2JEmSGjJsSZIkNWTYkiRJasiwJUmS1JBhS5IkqSHDliRJUkOGLUmSpIYMW5IkSQ0ZtiRJkhoybEmSJDVk2JIkSWrIsCVJktSQYUuSJKkhw5YkSVJDhi1JkqSGDFuSJEkNGbYkSZIaMmxJkiQ1ZNiSJElqyLAlSZLUkGFLkiSpIcOWJElSQ4YtSZKkhgxbkiRJDRm2JEmSGjJsSZIkNWTYkiRJasiwJUmS1JBhS5IkqSHDliRJUkOGLUmSpIYMW5IkSQ0ZtiRJkhoybEmSJDVk2JIkSWrIsCVJktSQYUuSJKkhw5YkSVJDhi1JkqSGDFuSJEkNGbYkSZIaMmxJkiQ1ZNiSJElqyLAlSZLUkGFLkiSpIcOWJElSQ4YtSZKkhgxbkiRJDRm2JEmSGjJsSZIkNWTYkiRJasiwJUmS1JBhS5IkqSHDliRJUkOGLUmSpIYMW5IkSQ0ZtiRJkhoybEmSJDVk2JIkSWrIsCVJktSQYUuSJKkhw5YkSVJDhi1JkqSGDFuSJEkNNQ1bSQ5M8oMklyU5uuW+JEmS5qJUVZsNJwuAHwJPA64Bvg28uKq+t7bPLFmypFasWNGkP3sdfToFHPnwu3jXyi2a7GNzYy0msx4D1mLAWgxYi8msx8BcrcWibRey/JinNdt+kguqaslM67Uc2Xo0cFlVXVFVdwInA89tuL+1mghakiRpfFx/65085vgzZrsbTcPWbsCqoeVr+rZNzqAlSdJ4uv7WO2e7C02nEf8QeEZV/Wm//FLg0VX1l1PWezXwaoBFixbte/LJJ2/0vqy89uZ7Xi/aCq7/1UbfxWbJWkxmPQasxYC1GLAWk1mPgblei4fvtl2T7R5wwAEjTSO2nGC9BthjaHl34CdTV6qqDwAfgO6craVLl270jrzi6NPveT1X55Vng7WYzHoMWIsBazFgLSazHgNzvRZXHbZ0Vvffchrx28CDk+yVZCFwKHBaw/2tVWZjp5IkadYt2nbhbHehXdiqqruAvwC+DHwf+FRVfbfV/qZz5dsPMnBJkjRmWn8bcVRNx/yq6ovAF1vuY1RXvv0gAJYtWzbrw4lzhbWYzHoMWIsBazFgLSazHgPWYnpeQV6SJKkhw5YkSVJDhi1JkqSGDFuSJEkNGbYkSZIaMmxJkiQ1ZNiSJElqyLAlSZLUkGFLkiSpoVTVbPfhHkl+Bvy48W52Bm5svI/NhbWYzHoMWIsBazFgLSazHgPjWos9q2qXmVaaU2FrU0iyoqqWzHY/5gJrMZn1GLAWA9ZiwFpMZj0GrMX0nEaUJElqyLAlSZLU0DiGrQ/MdgfmEGsxmfUYsBYD1mLAWkxmPQasxTTG7pwtSZKkTWkcR7YkSZI2mbEJW0kOTPKDJJclOXq2+9NKko8kuSHJpUNtOyY5I8mP+ucd+vYkeU9fk0uSPGroMy/v1/9RkpfPxrFsqCR7JPlGku8n+W6Sv+rbx64eSX47yflJLu5r8Za+fa8ky/vj+mSShX37lv3yZf37i4e29ca+/QdJnjE7R7ThkixI8p0kX+iXx7kWVyVZmeSiJCv6trH7OQFIsn2SU5L8Z/+743HjWIskv9v/fZh43JLkiHGsxUZRVfP+ASwALgceACwELgb2nu1+NTrW/YFHAZcOtb0TOLp/fTTwjv71s4D/AAI8Fljet+8IXNE/79C/3mG2j209anE/4FH9622BHwJ7j2M9+mPapn99L2B5f4yfAg7t298H/Fn/+nXA+/rXhwKf7F/v3f/8bAns1f9cLZjt41vPmvw18AngC/3yONfiKmDnKW1j93PSH8cJwJ/2rxcC249rLYZqsgD4KbDnuNdifR/jMrL1aOCyqrqiqu4ETgaeO8t9aqKqzgJWT2l+Lt0vEPrn5w21n1id84Dtk9wPeAZwRlWtrqpfAGcAB7bv/cZVVddV1YX961uB7wO7MYb16I/ptn7xXv2jgCcDp/TtU2sxUaNTgKckSd9+clXdUVVXApfR/XxtVpLsDhwEfKhfDmNai2mM3c9JkvvQ/Yf1wwBVdWdV3cQY1mKKpwCXV9WPsRbrZVzC1m7AqqHla/q2cbGoqq6DLoAA9+3b11aXeVevfurnkXQjOmNZj37a7CLgBrpfeJcDN1XVXf0qw8d1zzH3798M7MQ8qQXwz8BRwG/65Z0Y31pAF7y/kuSCJK/u28bx5+QBwM+Aj/ZTzB9KsjXjWYthhwIn9a/HvRbrZVzCVtbQ5tcw116XeVWvJNsAnwGOqKpbplt1DW3zph5VdXdV7QPsTjcC83trWq1/nre1SPJs4IaqumC4eQ2rzvtaDNmvqh4FPBP48yT7T7PufK7HFnSnYfzfqnokcDvdVNnazOdaANCfu/gc4NMzrbqGtnlViw0xLmHrGmCPoeXdgZ/MUl9mw/X9cC798w19+9rqMm/qleRedEHr41X12b55bOsB0E+LLKM7r2L7JFv0bw0f1z3H3L+/Hd309HyoxX7Ac5JcRXdKwZPpRrrGsRYAVNVP+ucbgFPpwvg4/pxcA1xTVcv75VPowtc41mLCM4ELq+r6fnmca7HexiVsfRt4cP9to4V0Q6KnzXKfNqXTgIlvgLwc+H9D7S/rv0XyWODmflj4y8DTk+zQf9Pk6X3bZqU/r+bDwPer6t1Db41dPZLskmT7/vVWwFPpzmH7BvDCfrWptZio0QuBr1dV9e2HpvuG3l7Ag4HzN81RbBxV9caq2r2qFtP9Lvh6VR3GGNYCIMnWSbadeE339/tSxvDnpKp+CqxK8rt901OA7zGGtRjyYgZTiDDetVh/s32G/qZ60H1T4od056kcM9v9aXicJwHXAf9F9z+KP6E7v+RrwI/65x37dQP8a1+TlcCSoe28ku6E38uAP57t41rPWjyBbrj6EuCi/vGscawH8AjgO30tLgX+tm9/AF1AuIxummDLvv23++XL+vcfMLStY/oa/QB45mwf2wbWZSmDbyOOZS364764f3x34vfjOP6c9MewD7Ci/1n5HN036Ma1FvcGfg5sN9Q2lrXY0IdXkJckSWpoXKYRJUmSZoVhS5IkqSHDliRJUkOGLUmSpIYMW5IkSQ0ZtiRJkhoybElaZ0m2T/K6hts/Ism9Z1jnqiQ7b8R9Lk3y+KHl1yZ52Ubc/pIk71nLexv1WCTNLYYtSetje6BZ2AKOoLug4qa0FLgnbFXV+6rqxI218apaUVWHb6ztSdp8GLYkrY+3Aw9MclGSjyZ5DkCSU5N8pH/9J0n+vn/9R0nO79d/f5IFffvTk3wryYVJPp1kmySHA7sC30jyjVE6k+Svk1zaP44Yan9ZkkuSXJzkY33bwUmWJ/lOkq8mWZRkMfBa4PV9H5+Y5Lgkb+g/s0+S8/ptndrfdoQky5K8oz+2HyZ54jR9XJrkC/3rnZJ8pe/D+1nzzXolzROGLUnr42jg8qrah+4+ZxMhYzdg7/71E4Czk/we8CJgv379u4HD+mmzNwNPrapH0d0i5a+r6j10N6o9oKoOmKkjSfYF/hh4DN3NtV+V5JFJHkp3O50nV9XvA3/Vf+Qc4LFV9Ui6G1EfVVVXAe8D/qmq9qmqs6fs5kTgb6rqEXS3Ijl26L0tqurRdKNxxzKaY4Fz+j6cBtx/xM9J2gxtMfMqkjSts4EjkuxNd9PeHZLcD3gccDjdzWr3Bb7d3RucrYAb6ILR3sC5fftC4Fvrsf8nAKdW1e0AST5LF/4KOKWqbgSoqtX9+rsDn+z7uBC4crqNJ9kO2L6qzuybTqC7V+KEz/bPFwCLR+zz/sAL+n6dnuQXI35O0mbIsCVpg1TVtf202oHAWcCOwCHAbVV1a7okdUJVvXH4c0kOBs6oqhdvYBfWNgUXusA11XuBd1fVaUmWAsdt4P7v6J/vZt1+p3pjWmlMOI0oaX3cCmw7tPwtumm0s+hGut7QPwN8DXhhkvsCJNkxyZ7AecB+SR7Ut987ye+sZfvTOQt4Xv/5rYHn9/v+GnBIkp0m9tuvvx1wbf/65dMcEwBVdTPwi6HzsV4KnDl1vXV0FnBY369nAjts4PYkzWGGLUnrrKp+Tjf9d2mSf6ALN1tU1WXAhXSjW2f3636P7tysryS5BDgDuF9V/Qx4BXBS334e8JB+Fx8A/mOUE+Sr6kLg34DzgeXAh6rqO1X1XeB44MwkFwPv7j9yHPDpJGcDNw5t6vPA8ydOkJ+ym5cD/9D3cx/grSOUaTpvAfZPciHwdODqDdyepDksVY5kS5IkteLIliRJUkOeIC9pTkuyHNhySvNLq2rlbPRnJkmeAbxjSvOVVfX82eiPpNnnNKIkSVJDTiNKkiQ1ZNiSJElqyLAlSZLUkGFLkiSpIcOWJElSQ/8fiIC7yWlTFDgAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 720x360 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#tweet_location定位\n",
    "df_tweet_location = df['tweet_location'].fillna('unkonwn')\n",
    "\n",
    "dic_tweet_location = []\n",
    "list_tweet_location = df_tweet_location.tolist()\n",
    "\n",
    "for i in list_tweet_location:\n",
    "    if i not in dic_tweet_location:\n",
    "        dic_tweet_location.append(i)\n",
    "        \n",
    "label_tweet_location = preprocessing.LabelEncoder()\n",
    "label_tweet_location.fit(dic_tweet_location)\n",
    "\n",
    "df_tweet_location_id_tmp = label_tweet_location.transform(df_tweet_location)\n",
    "df_tweet_location_id = pd.DataFrame(df_tweet_location_id_tmp, index=df.index, columns=['tweet_location_id'])\n",
    "\n",
    "df_tweet_location_id.drop_duplicates('tweet_location_id').count()\n",
    "#总类别个数约为1/3，进一步观测分布\n",
    "\n",
    "drawScatter(df_tweet_location_id,y,'tweet_location_id')\n",
    "\n",
    "#舍去该特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
    "#text与describe已经通过词的方式计算概率，另外部分特征进一步进行模型训练\n",
    "#取特征如下：\n",
    "# _golden 差异性很小，df值：df_golden\n",
    "# _trusted_judgments特征，差异小，除了默认的3，剩余50个非3的值为差异分布，df值：df['_trusted_judgments']\n",
    "# fav_number特征，存在一定程度上的全局差异，df值：df['fav_number']\n",
    "# link_color特征，存在部分分布差异，df值：df_link_color_id\n",
    "# retweet_count，相对显著，df值：df['retweet_count']\n",
    "# sidebar_color, 相对显著，df值: df_sidebar_color_id\n",
    "# tweet_count，相对显著，df值：df['tweet_count']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train : Text =>14127 : 4709\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "      <th>min</th>\n",
       "      <th>25%</th>\n",
       "      <th>50%</th>\n",
       "      <th>75%</th>\n",
       "      <th>max</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>_trusted_judgments</th>\n",
       "      <td>18836.0</td>\n",
       "      <td>3.655394</td>\n",
       "      <td>12.722085</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.00</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>274.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fav_number</th>\n",
       "      <td>18836.0</td>\n",
       "      <td>4413.461563</td>\n",
       "      <td>12468.532705</td>\n",
       "      <td>0.0</td>\n",
       "      <td>13.00</td>\n",
       "      <td>482.5</td>\n",
       "      <td>3375.5</td>\n",
       "      <td>341621.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>retweet_count</th>\n",
       "      <td>18836.0</td>\n",
       "      <td>0.082502</td>\n",
       "      <td>2.732317</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>330.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>tweet_count</th>\n",
       "      <td>18836.0</td>\n",
       "      <td>39135.700680</td>\n",
       "      <td>119130.638890</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2399.75</td>\n",
       "      <td>11312.5</td>\n",
       "      <td>39793.5</td>\n",
       "      <td>2680199.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>_golden</th>\n",
       "      <td>18836.0</td>\n",
       "      <td>0.002654</td>\n",
       "      <td>0.051455</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>link_color_id</th>\n",
       "      <td>18836.0</td>\n",
       "      <td>816.489594</td>\n",
       "      <td>937.403143</td>\n",
       "      <td>0.0</td>\n",
       "      <td>76.00</td>\n",
       "      <td>76.0</td>\n",
       "      <td>1732.0</td>\n",
       "      <td>2881.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sidebar_color_id</th>\n",
       "      <td>18836.0</td>\n",
       "      <td>312.381822</td>\n",
       "      <td>183.906661</td>\n",
       "      <td>0.0</td>\n",
       "      <td>226.75</td>\n",
       "      <td>352.0</td>\n",
       "      <td>405.0</td>\n",
       "      <td>544.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                      count          mean            std  min      25%  \\\n",
       "_trusted_judgments  18836.0      3.655394      12.722085  3.0     3.00   \n",
       "fav_number          18836.0   4413.461563   12468.532705  0.0    13.00   \n",
       "retweet_count       18836.0      0.082502       2.732317  0.0     0.00   \n",
       "tweet_count         18836.0  39135.700680  119130.638890  1.0  2399.75   \n",
       "_golden             18836.0      0.002654       0.051455  0.0     0.00   \n",
       "link_color_id       18836.0    816.489594     937.403143  0.0    76.00   \n",
       "sidebar_color_id    18836.0    312.381822     183.906661  0.0   226.75   \n",
       "\n",
       "                        50%      75%        max  \n",
       "_trusted_judgments      3.0      3.0      274.0  \n",
       "fav_number            482.5   3375.5   341621.0  \n",
       "retweet_count           0.0      0.0      330.0  \n",
       "tweet_count         11312.5  39793.5  2680199.0  \n",
       "_golden                 0.0      0.0        1.0  \n",
       "link_color_id          76.0   1732.0     2881.0  \n",
       "sidebar_color_id      352.0    405.0      544.0  "
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#拿到x特征集合\n",
    "df_x = pd.concat([df[['_trusted_judgments','fav_number','retweet_count','tweet_count']]\n",
    "                  ,df_golden\n",
    "                  ,df_link_color_id\n",
    "                  ,df_sidebar_color_id],axis=1)\n",
    "\n",
    "#数据切割\n",
    "x_train, x_test, y_train, y_test = train_test_split(df_x, y, test_size = 0.25)\n",
    "print(f'Train : Text =>{y_train.count()} : {y_test.count()}')\n",
    "\n",
    "df_x.describe().T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 18836 entries, 0 to 18835\n",
      "Data columns (total 7 columns):\n",
      "_trusted_judgments    18836 non-null int64\n",
      "fav_number            18836 non-null int64\n",
      "retweet_count         18836 non-null int64\n",
      "tweet_count           18836 non-null int64\n",
      "_golden               18836 non-null int64\n",
      "link_color_id         18836 non-null int64\n",
      "sidebar_color_id      18836 non-null int64\n",
      "dtypes: int64(7)\n",
      "memory usage: 1.0 MB\n",
      "[8.64891987e+01 1.13400454e+07 1.88537431e+02 9.95882595e+07\n",
      " 1.89903792e+00 6.14738280e+05 4.28232801e+02]\n",
      "[1.65618612e-19 0.00000000e+00 1.14714140e-41 0.00000000e+00\n",
      " 3.86927106e-01 0.00000000e+00 1.02430383e-93]\n",
      "tweet_count - y (pearsonr):(0.11123466734351128, 6.275174261723037e-53)\n",
      "retweet_count - y (pearsonr):(0.010437521513536177, 0.15201955024839234)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_selection import SelectKBest\n",
    "from sklearn.feature_selection import chi2\n",
    "from scipy.stats import pearsonr\n",
    "\n",
    "model_f = SelectKBest(chi2, k=3)\n",
    "model_f.fit_transform(df_x, y)\n",
    "\n",
    "df_x.info()\n",
    "\n",
    "print(model_f.scores_)\n",
    "print(model_f.pvalues_)\n",
    "print(f\"tweet_count - y (pearsonr):{pearsonr(df_x['tweet_count'],y)}\")\n",
    "print(f\"retweet_count - y (pearsonr):{pearsonr(df_x['retweet_count'],y)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "confusion_matrix: \n",
      "[[876 582 216]\n",
      " [480 805 237]\n",
      " [226 368 919]]\n",
      "0-precision: 0.5537294563843237\n",
      "0-recall: 0.5232974910394266\n",
      "1-precision: 0.4586894586894587\n",
      "1-recall: 0.5289093298291722\n",
      "2-precision: 0.6698250728862973\n",
      "2-recall: 0.6074025115664243\n",
      "avg-precison: 0.5607479959866932\n",
      "avg-recall: 0.5532031108116744\n",
      "accuracy: 0.552134211085156\n"
     ]
    }
   ],
   "source": [
    "#使用万能的xgboost算法，conda安装xgboost：conda install -c  anaconda py-xgboost\n",
    "from xgboost import XGBClassifier\n",
    "\n",
    "xgb = XGBClassifier()\n",
    "\n",
    "xgb.fit(x_train,y_train)\n",
    "\n",
    "y_predict = xgb.predict(x_test)\n",
    "\n",
    "other_confusion = confusion_matrix(y_test, y_predict,labels=[0,1,2])\n",
    "print(f'confusion_matrix: \\n{other_confusion}')\n",
    "\n",
    "#打印各个类别的precision和recall\n",
    "print_score(y_test, y_predict, other_confusion)\n",
    "\n",
    "#效果太差，整体结果比乱猜稍微好点，比text的结果差"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "confusion_matrix: \n",
      "[[1151  145  378]\n",
      " [ 941  175  406]\n",
      " [ 558   64  891]]\n",
      "0-precision: 0.43433962264150944\n",
      "0-recall: 0.6875746714456392\n",
      "1-precision: 0.4557291666666667\n",
      "1-recall: 0.11498028909329829\n",
      "2-precision: 0.5319402985074627\n",
      "2-recall: 0.5888962326503635\n",
      "avg-precison: 0.4740030292718796\n",
      "avg-recall: 0.46381706439643366\n",
      "accuracy: 0.4708005946060735\n"
     ]
    }
   ],
   "source": [
    "#尝试更换算法模型\n",
    "other_nb = MultinomialNB()\n",
    "other_nb.fit(x_train, y_train)\n",
    "y_predict = other_nb.predict(x_test)\n",
    "\n",
    "other_confusion = confusion_matrix(y_test, y_predict,labels=[0,1,2])\n",
    "print(f'confusion_matrix: \\n{other_confusion}')\n",
    "\n",
    "print_score(y_test,y_predict, other_confusion)\n",
    "\n",
    "#多项式分布的NB，对于male类型的分类简直是灾难"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "confusion_matrix: \n",
      "[[1561   80   33]\n",
      " [1407   81   34]\n",
      " [1178  241   94]]\n",
      "0-precision: 0.3765074770863483\n",
      "0-recall: 0.9324970131421745\n",
      "1-precision: 0.20149253731343283\n",
      "1-recall: 0.053219448094612355\n",
      "2-precision: 0.5838509316770186\n",
      "2-recall: 0.06212822207534699\n",
      "avg-precison: 0.3872836486922666\n",
      "avg-recall: 0.34928156110404457\n",
      "accuracy: 0.3686557655553196\n"
     ]
    }
   ],
   "source": [
    "#更换NB的分布\n",
    "#尝试更换算法模型,先更改朴素贝叶斯的分布\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "\n",
    "other_nb = GaussianNB()\n",
    "other_nb.fit(x_train, y_train)\n",
    "y_predict = other_nb.predict(x_test)\n",
    "\n",
    "other_confusion = confusion_matrix(y_test, y_predict,labels=[0,1,2])\n",
    "print(f'confusion_matrix: \\n{other_confusion}')\n",
    "\n",
    "print_score(y_test,y_predict, other_confusion)\n",
    "\n",
    "#整体的结果预测，都偏向于female，整体结果太差"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "confusion_matrix: \n",
      "[[1531   48   95]\n",
      " [1387   62   73]\n",
      " [ 898   38  577]]\n",
      "0-precision: 0.40120545073375263\n",
      "0-recall: 0.9145758661887694\n",
      "1-precision: 0.4189189189189189\n",
      "1-recall: 0.040735873850197106\n",
      "2-precision: 0.774496644295302\n",
      "2-recall: 0.3813615333773959\n",
      "avg-precison: 0.5315403379826579\n",
      "avg-recall: 0.44555775780545415\n",
      "accuracy: 0.4608197069441495\n"
     ]
    }
   ],
   "source": [
    "#更换NB的分布\n",
    "#尝试更换算法模型,先更改朴素贝叶斯的分布\n",
    "from sklearn.naive_bayes import BernoulliNB\n",
    "\n",
    "other_nb = BernoulliNB()\n",
    "other_nb.fit(x_train, y_train)\n",
    "y_predict = other_nb.predict(x_test)\n",
    "\n",
    "other_confusion = confusion_matrix(y_test, y_predict,labels=[0,1,2])\n",
    "print(f'confusion_matrix: \\n{other_confusion}')\n",
    "\n",
    "print_score(y_test,y_predict, other_confusion)\n",
    "\n",
    "#整体的结果预测，都偏向于female，整体结果太差, male类型几乎无法识别"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "confusion_matrix: \n",
      "[[1612   29   33]\n",
      " [1400   86   36]\n",
      " [1133   37  343]]\n",
      "0-precision: 0.3889022919179735\n",
      "0-recall: 0.9629629629629629\n",
      "1-precision: 0.5657894736842105\n",
      "1-recall: 0.056504599211563734\n",
      "2-precision: 0.8325242718446602\n",
      "2-recall: 0.2267019167217449\n",
      "avg-precison: 0.5957386791489481\n",
      "avg-recall: 0.4153898262987572\n",
      "accuracy: 0.43342535570184754\n"
     ]
    }
   ],
   "source": [
    "#使用SVM\n",
    "from sklearn.svm import SVC\n",
    "\n",
    "other_svc = SVC()\n",
    "other_svc.fit(x_train, y_train)\n",
    "y_predict = other_svc.predict(x_test)\n",
    "\n",
    "other_confusion = confusion_matrix(y_test, y_predict,labels=[0,1,2])\n",
    "print(f'confusion_matrix: \\n{other_confusion}')\n",
    "\n",
    "print_score(y_test,y_predict, other_confusion)\n",
    "\n",
    "#SVM的训练时间相对较长，整体结果偏向于female，识别度不高"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "confusion_matrix: \n",
      "[[979 340 355]\n",
      " [693 349 480]\n",
      " [477 112 924]]\n",
      "0-precision: 0.45556072591903213\n",
      "0-recall: 0.5848267622461171\n",
      "1-precision: 0.43570536828963796\n",
      "1-recall: 0.22930354796320632\n",
      "2-precision: 0.5252984650369528\n",
      "2-recall: 0.6107072042300066\n",
      "avg-precison: 0.4721881864152076\n",
      "avg-recall: 0.4749458381464433\n",
      "accuracy: 0.4782331705245275\n"
     ]
    }
   ],
   "source": [
    "#使用LR\n",
    "from sklearn.linear_model  import LogisticRegressionCV\n",
    "\n",
    "other_lr = LogisticRegressionCV()\n",
    "other_lr.fit(x_train, y_train)\n",
    "y_predict = other_lr.predict(x_test)\n",
    "\n",
    "other_confusion = confusion_matrix(y_test, y_predict,labels=[0,1,2])\n",
    "print(f'confusion_matrix: \\n{other_confusion}')\n",
    "\n",
    "print_score(y_test,y_predict, other_confusion)\n",
    "\n",
    "#LRCV的结果，整体结果一般，male误判为female较多"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "confusion_matrix: \n",
      "[[935 491 248]\n",
      " [599 678 245]\n",
      " [325 336 852]]\n",
      "0-precision: 0.5029585798816568\n",
      "0-recall: 0.5585424133811231\n",
      "1-precision: 0.4504983388704319\n",
      "1-recall: 0.4454664914586071\n",
      "2-precision: 0.633457249070632\n",
      "2-recall: 0.5631196298744217\n",
      "avg-precison: 0.5289713892742403\n",
      "avg-recall: 0.5223761782380506\n",
      "accuracy: 0.5234657039711191\n"
     ]
    }
   ],
   "source": [
    "#使用RandomForestC\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "other_rfc = RandomForestClassifier()\n",
    "other_rfc.fit(x_train, y_train)\n",
    "y_predict = other_rfc.predict(x_test)\n",
    "\n",
    "other_confusion = confusion_matrix(y_test, y_predict,labels=[0,1,2])\n",
    "print(f'confusion_matrix: \\n{other_confusion}')\n",
    "\n",
    "print_score(y_test,y_predict, other_confusion)\n",
    "\n",
    "#整体结果一般，male表现一直较差"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "######开启参数调优之路\n",
    "# + text相关的，说明text特征是有效的，可以更换算法尝试一下\n",
    "# + 通text的角度看,count的特征处理+MultinomialNB的方式是text中的最佳解\n",
    "# + 常规特征中,XGBClassifier和RandomForestClassifier可以进行参数调优"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "###对于text的结果进行参数调优\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "\n",
    "x_train, x_test, y_train, y_test = train_test_split(x_text_count, y, test_size = 0.35)\n",
    "\n",
    "model = MultinomialNB()\n",
    "\n",
    "alpha = [0.9,2.0]\n",
    "fit_prior = [True]\n",
    "\n",
    "param_grid = dict(alpha = alpha, fit_prior = fit_prior)\n",
    "\n",
    "kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)\n",
    "grid_search = GridSearchCV(model, param_grid, scoring='accuracy', n_jobs = 3, cv = kfold)\n",
    "\n",
    "grid_result = grid_search.fit(x_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best: 0.620600 using {'alpha': 2.0, 'fit_prior': True}\n",
      "0.615944  with:   {'alpha': 0.9, 'fit_prior': True}\n",
      "0.620600  with:   {'alpha': 2.0, 'fit_prior': True}\n",
      "confusion_matrix: \n",
      "[[1692  468  214]\n",
      " [ 799 1098  252]\n",
      " [ 321  291 1458]]\n",
      "0-precision: 0.6017069701280228\n",
      "0-recall: 0.7127211457455771\n",
      "1-precision: 0.5912762520193862\n",
      "1-recall: 0.5109353187529083\n",
      "2-precision: 0.7577962577962578\n",
      "2-recall: 0.7043478260869566\n",
      "avg-precison: 0.6502598266478888\n",
      "avg-recall: 0.642668096861814\n",
      "accuracy: 0.6443197330502047\n"
     ]
    }
   ],
   "source": [
    "print(\"Best: %f using %s\" % (grid_result.best_score_,grid_search.best_params_))\n",
    "\n",
    "means = grid_result.cv_results_['mean_test_score']\n",
    "params = grid_result.cv_results_['params']\n",
    "for mean,param in zip(means,params):\n",
    "    print(\"%f  with:   %r\" % (mean,param))\n",
    "    \n",
    "y_predict = grid_search.predict(x_test)\n",
    "\n",
    "other_confusion = confusion_matrix(y_test, y_predict,labels=[0,1,2])\n",
    "print(f'confusion_matrix: \\n{other_confusion}')\n",
    "\n",
    "print_score(y_test,y_predict, other_confusion)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "###常规算法中,xgboost,RandomForestClassifier进行参数调优\n",
    "#拿到x特征集合\n",
    "# df_x = pd.concat([df[['_trusted_judgments','fav_number','retweet_count','tweet_count']]\n",
    "#                   ,df_golden\n",
    "#                   ,df_link_color_id\n",
    "#                   ,df_sidebar_color_id],axis=1)\n",
    "from xgboost import XGBClassifier\n",
    "###对于text的结果进行参数调优\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "\n",
    "x_train, x_test, y_train, y_test = train_test_split(df_x, y, test_size = 0.35)\n",
    "\n",
    "model = XGBClassifier()\n",
    "\n",
    "n_estimatores = [50,100]\n",
    "max_depth = [8,12]\n",
    "subsample = [0.75]\n",
    "colsample_bytree = [0.75]\n",
    "learning_rate = [0.1]\n",
    "objective = ['multi：softmax  num_class=3']\n",
    "gamma = [0, 0.2]\n",
    "\n",
    "param_grid = dict(n_estimatores = n_estimatores\n",
    "                  ,max_depth = max_depth\n",
    "                  ,subsample = subsample\n",
    "                  ,colsample_bytree = colsample_bytree\n",
    "                  ,learning_rate = learning_rate\n",
    "                  ,objective = objective\n",
    "                  ,gamma = gamma)\n",
    "\n",
    "kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)\n",
    "grid_search = GridSearchCV(model, param_grid, scoring='accuracy', n_jobs = 3, cv = kfold)\n",
    "\n",
    "grid_result = grid_search.fit(x_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best: 0.546516 using {'colsample_bytree': 0.75, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 8, 'n_estimatores': 50, 'objective': 'multi：softmax  num_class=3', 'subsample': 0.75}\n",
      "confusion_matrix: \n",
      "[[1264  808  286]\n",
      " [ 706 1094  337]\n",
      " [ 366  496 1236]]\n",
      "0-precision: 0.541095890410959\n",
      "0-recall: 0.536047497879559\n",
      "1-precision: 0.4562135112593828\n",
      "1-recall: 0.5119326158165652\n",
      "2-precision: 0.664873587950511\n",
      "2-recall: 0.5891325071496664\n",
      "avg-precison: 0.5540609965402843\n",
      "avg-recall: 0.5457042069485968\n",
      "accuracy: 0.5451236159563173\n"
     ]
    }
   ],
   "source": [
    "print(\"Best: %f using %s\" % (grid_result.best_score_,grid_search.best_params_))\n",
    "\n",
    "means = grid_result.cv_results_['mean_test_score']\n",
    "params = grid_result.cv_results_['params']\n",
    "# for mean,param in zip(means,params):\n",
    "#     print(\"%f  with:   %r\" % (mean,param))\n",
    "    \n",
    "y_predict = grid_search.predict(x_test)\n",
    "\n",
    "other_confusion = confusion_matrix(y_test, y_predict,labels=[0,1,2])\n",
    "print(f'confusion_matrix: \\n{other_confusion}')\n",
    "\n",
    "print_score(y_test,y_predict, other_confusion)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "##randomforest参数调优\n",
    "x_train, x_test, y_train, y_test = train_test_split(df_x, y, test_size = 0.35)\n",
    "\n",
    "model = RandomForestClassifier()\n",
    "\n",
    "n_estimators = [200,300]\n",
    "max_depth = [8,12]\n",
    "criterion = ['entropy','gini']\n",
    "\n",
    "param_grid = dict(n_estimators = n_estimators\n",
    "                  ,max_depth = max_depth\n",
    "                  ,criterion = criterion)\n",
    "\n",
    "kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)\n",
    "grid_search = GridSearchCV(model, param_grid, scoring='accuracy', n_jobs = 3, cv = kfold)\n",
    "\n",
    "grid_result = grid_search.fit(x_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best: 0.542596 using {'criterion': 'gini', 'max_depth': 12, 'n_estimators': 300}\n",
      "confusion_matrix: \n",
      "[[1312  657  311]\n",
      " [ 766 1087  311]\n",
      " [ 362  539 1248]]\n",
      "0-precision: 0.5377049180327869\n",
      "0-recall: 0.5754385964912281\n",
      "1-precision: 0.47612790188348664\n",
      "1-recall: 0.5023105360443623\n",
      "2-precision: 0.667379679144385\n",
      "2-recall: 0.5807352256863657\n",
      "avg-precison: 0.5604041663535528\n",
      "avg-recall: 0.5528281194073187\n",
      "accuracy: 0.5531624450174427\n"
     ]
    }
   ],
   "source": [
    "print(\"Best: %f using %s\" % (grid_result.best_score_,grid_search.best_params_))\n",
    "\n",
    "means = grid_result.cv_results_['mean_test_score']\n",
    "params = grid_result.cv_results_['params']\n",
    "# for mean,param in zip(means,params):\n",
    "#     print(\"%f  with:   %r\" % (mean,param))\n",
    "    \n",
    "y_predict = grid_search.predict(x_test)\n",
    "\n",
    "other_confusion = confusion_matrix(y_test, y_predict,labels=[0,1,2])\n",
    "print(f'confusion_matrix: \\n{other_confusion}')\n",
    "\n",
    "print_score(y_test,y_predict, other_confusion)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
